From 59e83455f188e48796383bfe4be99deb81cb1caa Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Wed, 17 Jun 2015 15:15:04 +0300 Subject: [PATCH] add some more bayes stuff --- include/rssfuncs.php | 5 + plugins/af_sort_bayes/init.php | 103 ++++++++++++++++- .../af_sort_bayes/lib/class.naivebayesian.php | 31 +++-- .../lib/class.naivebayesianstorage.php | 108 +++++++++++------- 4 files changed, 191 insertions(+), 56 deletions(-) diff --git a/include/rssfuncs.php b/include/rssfuncs.php index 17233914e..4dbb7c18e 100644 --- a/include/rssfuncs.php +++ b/include/rssfuncs.php @@ -680,6 +680,7 @@ $article = array("owner_uid" => $owner_uid, // read only "guid" => $entry_guid, // read only + "guid_hashed" => $entry_guid_hashed, // read only "title" => $entry_title, "content" => $entry_content, "link" => $entry_link, @@ -968,6 +969,10 @@ lang = '$entry_language' WHERE id = '$ref_id'"); + // update aux data + db_query("UPDATE ttrss_user_entries + SET score = '$score' WHERE ref_id = '$ref_id'"); + if ($mark_unread_on_update) { db_query("UPDATE ttrss_user_entries SET last_read = null, unread = true WHERE ref_id = '$ref_id'"); diff --git a/plugins/af_sort_bayes/init.php b/plugins/af_sort_bayes/init.php index 213c6aede..23f38ec25 100644 --- a/plugins/af_sort_bayes/init.php +++ b/plugins/af_sort_bayes/init.php @@ -5,6 +5,7 @@ class Af_Sort_Bayes extends Plugin { private $host; private $filters = array(); private $dbh; + private $score_modifier = 50; function about() { return array(1.0, @@ -31,8 +32,39 @@ class Af_Sort_Bayes extends Plugin { $article_id = (int) $_REQUEST["article_id"]; $train_up = sql_bool_to_bool($_REQUEST["train_up"]); - print "FIXME: $article_id :: $train_up"; + $category = $train_up ? "GOOD" : "NEUTRAL"; + $nbs = new NaiveBayesianStorage($_SESSION["uid"]); + $nb = new NaiveBayesian($nbs); + + $result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " . + $article_id . " AND owner_uid = " . $_SESSION["uid"]); + + if ($this->dbh->num_rows($result) != 0) { + $guid = $this->dbh->fetch_result($result, 0, "guid"); + $title = $this->dbh->fetch_result($result, 0, "title"); + $content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content"))); + $score = $this->dbh->fetch_result($result, 0, "score"); + + $this->dbh->query("BEGIN"); + + if ($nb->untrain($guid, $content)) { + if ($score >= $this->score_modifier) $score -= $this->score_modifier; + } + + $nb->train($guid, $nbs->getCategoryByName($category), $content); + + if ($category == "GOOD") $score += $this->score_modifier; + + $this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $_SESSION["uid"]); + + $nb->updateProbabilities(); + + $this->dbh->query("COMMIT"); + + } + + print "$article_id :: $category"; } function get_js() { @@ -54,9 +86,11 @@ class Af_Sort_Bayes extends Plugin { function init_database() { $prefix = "ttrss_plugin_af_sort_bayes"; - /*$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_references", false); - $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_categories", false); - $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_wordfreqs", false);*/ + // TODO there probably should be a way for plugins to determine their schema version to upgrade tables + + /*$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_wordfreqs", false); + $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_references", false); + $this->dbh->query("DROP TABLE IF EXISTS ${prefix}_categories", false);*/ $this->dbh->query("BEGIN"); @@ -69,9 +103,9 @@ class Af_Sort_Bayes extends Plugin { owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE, word_count BIGINT NOT NULL DEFAULT '0')"); - $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_documents ( + $this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_references ( id SERIAL NOT NULL PRIMARY KEY, - document varchar(250) NOT NULL DEFAULT '', + document_id VARCHAR(255) NOT NULL, category_id INTEGER NOT NULL REFERENCES ${prefix}_categories(id) ON DELETE CASCADE, owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE, content text NOT NULL)"); @@ -82,6 +116,17 @@ class Af_Sort_Bayes extends Plugin { owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE, count BIGINT NOT NULL DEFAULT '0')"); + $owner_uid = @$_SESSION["uid"]; + + if ($owner_uid) { + $result = $this->dbh->query("SELECT id FROM ${prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1"); + + if ($this->dbh->num_rows($result) == 0) { + $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)"); + $this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('NEUTRAL', $owner_uid)"); + } + } + $this->dbh->query("COMMIT"); } @@ -98,6 +143,52 @@ class Af_Sort_Bayes extends Plugin { function hook_article_filter($article) { $owner_uid = $article["owner_uid"]; + $nbs = new NaiveBayesianStorage($owner_uid); + $nb = new NaiveBayesian($nbs); + + $categories = $nbs->getCategories(); + + if (count($categories) > 0) { + + $count_neutral = 0; + $count_good = 0; + $id_good = 0; + $id_neutral = 0; + + foreach ($categories as $id => $cat) { + if ($cat["category"] == "GOOD") { + $id_good = $id; + $count_good += $cat["word_count"]; + } else if ($cat["category"] == "NEUTRAL") { + $id_neutral = $id; + $count_neutral += $cat["word_count"]; + } + } + + $dst_category = $id_neutral; + + $bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"])); + + if ($count_neutral >= 3000 && $count_good >= 1000) { + // enable automatic categorization + + $result = $nb->categorize($bayes_content); + + if (count($result) == 2) { + $prob_good = $result[$id_good]; + $prob_neutral = $result[$id_neutral]; + + if ($prob_good > 0.90 && $prob_good > $prob_neutral) { + //$dst_category = $id_good; // should we autofile as good or not? idk + $article["score_modifier"] += $this->score_modifier; + } + } + } + + $nb->train($article["guid_hashed"], $dst_category, $bayes_content); + + $nb->updateProbabilities(); + } return $article; diff --git a/plugins/af_sort_bayes/lib/class.naivebayesian.php b/plugins/af_sort_bayes/lib/class.naivebayesian.php index 1c2ef463b..c80c3f215 100644 --- a/plugins/af_sort_bayes/lib/class.naivebayesian.php +++ b/plugins/af_sort_bayes/lib/class.naivebayesian.php @@ -85,6 +85,7 @@ reset($tokens); while (list($token, $count) = each($tokens)) { + if ($this->nbs->wordExists($token)) { $word = $this->nbs->getWord($token, $category); @@ -120,8 +121,10 @@ function train($doc_id, $category_id, $content) { $ret = false; + // if this doc_id already trained, no trained - if (!$this->nbs->getReference($doc_id)) { + if (!$this->nbs->getReference($doc_id, false)) { + $tokens = $this->_getTokens($content); while (list($token, $count) = each($tokens)) { @@ -149,15 +152,21 @@ */ function untrain($doc_id) { $ref = $this->nbs->getReference($doc_id); - $tokens = $this->_getTokens($ref['content']); - while (list($token, $count) = each($tokens)) { - $this->nbs->removeWord($token, $count, $ref['category_id']); + if (isset($ref['content'])) { + + $tokens = $this->_getTokens($ref['content']); + + while (list($token, $count) = each($tokens)) { + $this->nbs->removeWord($token, $count, $ref['category_id']); + } + + $this->nbs->removeReference($doc_id); + + return true; + } else { + return false; } - - $this->nbs->removeReference($doc_id); - - return true; } /** rescale the results between 0 and 1. @@ -226,18 +235,18 @@ function _getTokens($string) { $rawtokens = array(); $tokens = array(); - $string = $this->_cleanString($string); + //$string = $this->_cleanString($string); if (count(0 >= $this->ignore_list)) { $this->ignore_list = $this->getIgnoreList(); } - $rawtokens = split("[^-_A-Za-z0-9]+", $string); + $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY); // remove some tokens while (list(, $token) = each($rawtokens)) { $token = trim($token); - if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) { + if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) { $tokens[$token]++; } } diff --git a/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php b/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php index fccdcaf06..4727705ef 100644 --- a/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php +++ b/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php @@ -61,25 +61,36 @@ */ function getCategories() { $categories = array(); - $rs = $this->con->query('SELECT * FROM ttrss_plugin_af_sort_bayes_categories'); + $rs = $this->con->query('SELECT * FROM ttrss_plugin_af_sort_bayes_categories WHERE owner_uid = ' . $this->owner_uid); - while ($this->con->fetch_assoc($rs)) { - $categories[$rs['category_id']] = array('probability' => $rs['probability'], - 'word_count' => $rs['word_count'] + while ($line = $this->con->fetch_assoc($rs)) { + $categories[$line['id']] = array('probability' => $line['probability'], + 'category' => $line['category'], + 'word_count' => $line['word_count'] ); - - } return $categories; } + function getCategoryByName($category) { + $rs = $this->con->query("SELECT id FROM ttrss_plugin_af_sort_bayes_categories WHERE category = '" . + $this->con->escape_string($category) . "' AND owner_uid = " . $this->owner_uid); + + if ($this->con->num_rows($rs) != 0) { + return $this->con->fetch_result($rs, 0, "id"); + } + + return false; + } + /** see if the word is an already learnt word. @return bool @param string word */ function wordExists($word) { - $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "'"); + $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND + owner_uid = " . $this->owner_uid); return $this->con->num_rows($rs) != 0; } @@ -92,13 +103,13 @@ function getWord($word, $category_id) { $details = array(); - $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND category_id='" . $this->con->escape_string($category_id) . "'"); + $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . + $this->con->escape_string($word) . "' AND category_id=" . (int)$category_id); if ($this->con->num_rows($rs) == 0 ) { $details['count'] = 0; - } - else { - $details['count'] = $rs['count']; + } else { + $details['count'] = $this->con->fetch_result($rs, 0, "count"); } return $details; @@ -116,10 +127,14 @@ $oldword = $this->getWord($word, $category_id); if (0 == $oldword['count']) { - return $this->con->execute("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count) VALUES ('" . $this->con->escape_string($word) . "', '" . $this->con->escape_string($category_id) . "', '" . $this->con->escape_string((int) $count) . "')"); + return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count, owner_uid) + VALUES ('" . $this->con->escape_string($word) . "', '" . + (int)$category_id . "', '" . + (int)$count . "', '". + $this->owner_uid . "')"); } else { - return $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'"); + return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'"); } } @@ -134,10 +149,14 @@ $oldword = $this->getWord($word, $category_id); if (0 != $oldword['count'] && 0 >= ($oldword['count'] - $count)) { - return $this->con->execute("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND category_id='" . $this->con->escape_string($category_id) . "'"); + return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . + $this->con->escape_string($word) . "' AND category_id='" . + $this->con->escape_string($category_id) . "'"); } else { - return $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'"); + return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " . + (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' + AND word = '" . $this->con->escape_string($word) . "'"); } } @@ -148,26 +167,23 @@ */ function updateProbabilities() { // first update the word count of each category - $rs = $this->con->query("SELECT category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE 1 GROUP BY category_id"); - $total_words = 0; + $rs = $this->con->query("SELECT SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE owner_uid = ".$this->owner_uid); - while ($this->con->fetch_assoc($rs)) { - $total_words += $rs['total']; - - } - - $rs->moveStart(); + $total_words = $this->con->fetch_result($rs, 0, "total"); if ($total_words == 0) { - $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE 1"); - + $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE owner_uid = " . $this->owner_uid); return true; } - while ($this->con->fetch_assoc($rs)) { - $proba = $rs['total'] / $total_words; - $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=" . (int) $rs['total'] . ", probability=" . $proba . " WHERE category_id = '" . $rs['category_id'] . "'"); - + $rs = $this->con->query("SELECT tc.id AS category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_categories AS tc + LEFT JOIN ttrss_plugin_af_sort_bayes_wordfreqs AS tw ON (tc.id = tw.category_id) WHERE tc.owner_uid = ".$this->owner_uid." GROUP BY tc.id"); + + while ($line = $this->con->fetch_assoc($rs)) { + + $proba = (int)$line['total'] / $total_words; + $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=" . (int) $line['total'] . + ", probability=" . $proba . " WHERE id = '" . $line['category_id'] . "'"); } return true; @@ -181,8 +197,10 @@ @param string content of the reference */ function saveReference($doc_id, $category_id, $content) { - - return $this->con->execute("INSERT INTO ttrss_plugin_af_sort_bayes_references (id, category_id, content) VALUES ('" . $this->con->escape_string($doc_id) . "', '" . $this->con->escape_string($category_id) . "', '" . $this->con->escape_string($content) . "')"); + return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_references (document_id, category_id, owner_uid) VALUES + ('" . $this->con->escape_string($doc_id) . "', '" . + (int)$category_id . "', " . + (int)$this->owner_uid . ")"); } /** get a reference from the database. @@ -190,17 +208,29 @@ @return array reference( category_id => ...., content => ....) @param string id */ - function getReference($doc_id) { - $ref = array(); - $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_references WHERE id='" . $this->con->escape_string($doc_id) . "'"); + function getReference($doc_id, $include_content = true) + { - if ($this->con->num_rows($rs) == 0 ) { + $ref = array(); + $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" . + $this->con->escape_string($doc_id) . "' AND owner_uid = " . $this->owner_uid); + + if ($this->con->num_rows($rs) == 0) { return $ref; } - $ref['category_id'] = $rs['category_id']; - $ref['content'] = $rs['content']; - $ref['id'] = $rs['id']; + $ref['category_id'] = $this->con->fetch_result($rs, 0, 'category_id'); + $ref['id'] = $this->con->fetch_result($rs, 0, 'id'); + $ref['document_id'] = $this->con->fetch_result($rs, 0, 'document_id'); + + if ($include_content) { + $rs = $this->con->query("SELECT content, title FROM ttrss_entries WHERE guid = '" . + $this->con->escape_string($ref['document_id']) . "'"); + + if ($this->con->num_rows($rs) != 0) { + $ref['content'] = mb_strtolower($this->con->fetch_result($rs, 0, 'title') . ' ' . strip_tags($this->con->fetch_result($rs, 0, 'content'))); + } + } return $ref; } @@ -212,7 +242,7 @@ */ function removeReference($doc_id) { - return $this->con->execute("DELETE FROM ttrss_plugin_af_sort_bayes_references WHERE id='" . $this->con->escape_string($doc_id) . "'"); + return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" . $this->con->escape_string($doc_id) . "' AND owner_uid = " . $this->owner_uid); } }