Use transactions in the feed update process; better handle feeds without entry dates (schema updated)

This commit is contained in:
Andrew Dolgov 2005-08-23 06:46:48 +01:00
parent 9d9c9dd645
commit b82af8c382
3 changed files with 55 additions and 23 deletions

View File

@ -16,6 +16,14 @@
if ($op == "rpc") { if ($op == "rpc") {
$subop = $_GET["subop"]; $subop = $_GET["subop"];
if ($subop == "forceUpdateAll") {
update_all_feeds($link, true);
}
if ($subop == "updateAll") {
update_all_feeds($link, false);
}
if ($subop == "catchupPage") { if ($subop == "catchupPage") {

View File

@ -3,6 +3,8 @@
function update_all_feeds($link, $fetch) { function update_all_feeds($link, $fetch) {
pg_query("BEGIN");
if (!$fetch) { if (!$fetch) {
$result = pg_query($link, "SELECT feed_url,id FROM ttrss_feeds WHERE $result = pg_query($link, "SELECT feed_url,id FROM ttrss_feeds WHERE
@ -21,6 +23,8 @@
$num_unread += update_rss_feed($link, $line["feed_url"], $line["id"]); $num_unread += update_rss_feed($link, $line["feed_url"], $line["id"]);
} }
pg_query("COMMIT");
} }
function update_rss_feed($link, $feed_url, $feed) { function update_rss_feed($link, $feed_url, $feed) {
@ -30,13 +34,14 @@
$num_unread = 0; $num_unread = 0;
if ($rss) { if ($rss) {
pg_query("BEGIN");
$result = pg_query("SELECT title FROM ttrss_feeds WHERE id = '$feed'"); $result = pg_query("SELECT title FROM ttrss_feeds WHERE id = '$feed'");
$registered_title = pg_fetch_result($result, 0, "title"); $registered_title = pg_fetch_result($result, 0, "title");
if (!$registered_title) { if (!$registered_title) {
$feed_title = $rss->channel["title"]; $feed_title = $rss->channel["title"];
pg_query("UPDATE ttrss_feeds SET title = '$feed_title' WHERE id = '$feed'"); pg_query("UPDATE ttrss_feeds SET title = '$feed_title' WHERE id = '$feed'");
} }
@ -49,15 +54,22 @@
if (!$entry_guid) $entry_guid = $item["link"]; if (!$entry_guid) $entry_guid = $item["link"];
$entry_timestamp = ""; $entry_timestamp = "";
$rss_2_date = $item['pubdate']; $rss_2_date = $item['pubdate'];
$rss_1_date = $item['dc']['date']; $rss_1_date = $item['dc']['date'];
$atom_date = $item['issued']; $atom_date = $item['issued'];
$no_orig_date = 'false';
if ($atom_date != "") $entry_timestamp = parse_w3cdtf($atom_date); if ($atom_date != "") $entry_timestamp = parse_w3cdtf($atom_date);
if ($rss_1_date != "") $entry_timestamp = parse_w3cdtf($rss_1_date); if ($rss_1_date != "") $entry_timestamp = parse_w3cdtf($rss_1_date);
if ($rss_2_date != "") $entry_timestamp = strtotime($rss_2_date); if ($rss_2_date != "") $entry_timestamp = strtotime($rss_2_date);
if ($entry_timestamp == "") $entry_timestamp = 0; // if ($rss_3_date != "") $entry_timestamp = strtotime($rss_3_date);
if ($entry_timestamp == "") {
$entry_timestamp = time();
$no_orig_date = 'true';
}
if (!$entry_timestamp) continue; if (!$entry_timestamp) continue;
@ -77,7 +89,7 @@
$result = pg_query($link, " $result = pg_query($link, "
SELECT SELECT
id,unread,md5_hash,last_read, id,unread,md5_hash,last_read,no_orig_date,title,
EXTRACT(EPOCH FROM updated) as updated_timestamp EXTRACT(EPOCH FROM updated) as updated_timestamp
FROM FROM
ttrss_entries ttrss_entries
@ -89,11 +101,12 @@
$entry_timestamp = strftime("%Y/%m/%d %H:%M:%S", $entry_timestamp); $entry_timestamp = strftime("%Y/%m/%d %H:%M:%S", $entry_timestamp);
$query = "INSERT INTO ttrss_entries $query = "INSERT INTO ttrss_entries
(title, guid, link, updated, content, feed_id, md5_hash) (title, guid, link, updated, content, feed_id,
md5_hash, no_orig_date)
VALUES VALUES
('$entry_title', '$entry_guid', '$entry_link', ('$entry_title', '$entry_guid', '$entry_link',
'$entry_timestamp', '$entry_content', '$feed', '$entry_timestamp', '$entry_content', '$feed',
'$content_md5')"; '$content_md5', $no_orig_date)";
$result = pg_query($link, $query); $result = pg_query($link, $query);
@ -108,33 +121,41 @@
$unread = pg_fetch_result($result, 0, "unread"); $unread = pg_fetch_result($result, 0, "unread");
$md5_hash = pg_fetch_result($result, 0, "md5_hash"); $md5_hash = pg_fetch_result($result, 0, "md5_hash");
$no_orig_date = pg_fetch_result($result, 0, "no_orig_date");
// if ($md5_hash != $content_md5 && CONTENT_CHECK_MD5) $orig_title = pg_fetch_result($result, 0, "title");
// $unread = "true";
if (!$last_read || $md5_hash != $content_md5) { // disable update detection for posts which didn't have correct
$last_read = 'null'; // publishment date, because they will always register as updated
// sadly this doesn't catch feed generators which input current date
// in posts all the time (some planets do this)
if ($no_orig_date != 't' && (!$last_read || $md5_hash != $content_md5)) {
$last_read_qpart = 'last_read = null,';
} else { } else {
$last_read = "'$last_read'"; $last_read_qpart = '';
} }
// if ($unread || !CONTENT_CHECK_MD5) { // mark post as updated on title change
// $updated_query_part = "updated = '$entry_timestamp',"; // maybe we should mark it as unread instead?
// }
// if ($updated_timestamp > $entry_timestamp) { if ($orig_title != $entry_title) {
// $unread = "true"; $last_read_qpart = 'last_read = null,';
// print "$updated_timestamp : $entry_timestamp<br>"; }
// }
// don't bother updating timestamps on posts with broken pubDate
if ($no_orig_date != 't') {
$update_timestamp_qpart = "updated = '$entry_timestamp_fmt',";
}
$query = "UPDATE ttrss_entries $query = "UPDATE ttrss_entries
SET SET
title ='$entry_title', title ='$entry_title',
link = '$entry_link', link = '$entry_link',
updated = '$entry_timestamp_fmt', $update_timestamp_qpart
$last_read_qpart
content = '$entry_content', content = '$entry_content',
md5_hash = '$content_md5', md5_hash = '$content_md5',
last_read = $last_read,
unread = '$unread' unread = '$unread'
WHERE WHERE
id = '$entry_id'"; id = '$entry_id'";
@ -151,6 +172,8 @@
$result = pg_query($link, "UPDATE ttrss_feeds SET last_updated = NOW()"); $result = pg_query($link, "UPDATE ttrss_feeds SET last_updated = NOW()");
} }
pg_query("COMMIT");
} }
} }

View File

@ -34,5 +34,6 @@ create table ttrss_entries (id serial not null primary key,
md5_hash varchar(200) not null unique, md5_hash varchar(200) not null unique,
content text not null, content text not null,
last_read timestamp, last_read timestamp,
unread boolean default true); no_orig_date boolean not null default false,
unread boolean not null default true);