add some more bayes stuff

This commit is contained in:
Andrew Dolgov 2015-06-17 15:15:04 +03:00
parent 853cc128d6
commit 59e83455f1
4 changed files with 191 additions and 56 deletions

View File

@ -680,6 +680,7 @@
$article = array("owner_uid" => $owner_uid, // read only
"guid" => $entry_guid, // read only
"guid_hashed" => $entry_guid_hashed, // read only
"title" => $entry_title,
"content" => $entry_content,
"link" => $entry_link,
@ -968,6 +969,10 @@
lang = '$entry_language'
WHERE id = '$ref_id'");
// update aux data
db_query("UPDATE ttrss_user_entries
SET score = '$score' WHERE ref_id = '$ref_id'");
if ($mark_unread_on_update) {
db_query("UPDATE ttrss_user_entries
SET last_read = null, unread = true WHERE ref_id = '$ref_id'");

View File

@ -5,6 +5,7 @@ class Af_Sort_Bayes extends Plugin {
private $host;
private $filters = array();
private $dbh;
private $score_modifier = 50;
function about() {
return array(1.0,
@ -31,8 +32,39 @@ class Af_Sort_Bayes extends Plugin {
$article_id = (int) $_REQUEST["article_id"];
$train_up = sql_bool_to_bool($_REQUEST["train_up"]);
print "FIXME: $article_id :: $train_up";
$category = $train_up ? "GOOD" : "NEUTRAL";
$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
$nb = new NaiveBayesian($nbs);
$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
$article_id . " AND owner_uid = " . $_SESSION["uid"]);
if ($this->dbh->num_rows($result) != 0) {
$guid = $this->dbh->fetch_result($result, 0, "guid");
$title = $this->dbh->fetch_result($result, 0, "title");
$content = mb_strtolower($title . " " . strip_tags($this->dbh->fetch_result($result, 0, "content")));
$score = $this->dbh->fetch_result($result, 0, "score");
$this->dbh->query("BEGIN");
if ($nb->untrain($guid, $content)) {
if ($score >= $this->score_modifier) $score -= $this->score_modifier;
}
$nb->train($guid, $nbs->getCategoryByName($category), $content);
if ($category == "GOOD") $score += $this->score_modifier;
$this->dbh->query("UPDATE ttrss_user_entries SET score = '$score' WHERE ref_id = $article_id AND owner_uid = " . $_SESSION["uid"]);
$nb->updateProbabilities();
$this->dbh->query("COMMIT");
}
print "$article_id :: $category";
}
function get_js() {
@ -54,9 +86,11 @@ class Af_Sort_Bayes extends Plugin {
function init_database() {
$prefix = "ttrss_plugin_af_sort_bayes";
/*$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_references", false);
$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_categories", false);
$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_wordfreqs", false);*/
// TODO there probably should be a way for plugins to determine their schema version to upgrade tables
/*$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_wordfreqs", false);
$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_references", false);
$this->dbh->query("DROP TABLE IF EXISTS ${prefix}_categories", false);*/
$this->dbh->query("BEGIN");
@ -69,9 +103,9 @@ class Af_Sort_Bayes extends Plugin {
owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
word_count BIGINT NOT NULL DEFAULT '0')");
$this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_documents (
$this->dbh->query("CREATE TABLE IF NOT EXISTS ${prefix}_references (
id SERIAL NOT NULL PRIMARY KEY,
document varchar(250) NOT NULL DEFAULT '',
document_id VARCHAR(255) NOT NULL,
category_id INTEGER NOT NULL REFERENCES ${prefix}_categories(id) ON DELETE CASCADE,
owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
content text NOT NULL)");
@ -82,6 +116,17 @@ class Af_Sort_Bayes extends Plugin {
owner_uid INTEGER NOT NULL REFERENCES ttrss_users(id) ON DELETE CASCADE,
count BIGINT NOT NULL DEFAULT '0')");
$owner_uid = @$_SESSION["uid"];
if ($owner_uid) {
$result = $this->dbh->query("SELECT id FROM ${prefix}_categories WHERE owner_uid = $owner_uid LIMIT 1");
if ($this->dbh->num_rows($result) == 0) {
$this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('GOOD', $owner_uid)");
$this->dbh->query("INSERT INTO ${prefix}_categories (category, owner_uid) VALUES ('NEUTRAL', $owner_uid)");
}
}
$this->dbh->query("COMMIT");
}
@ -98,6 +143,52 @@ class Af_Sort_Bayes extends Plugin {
function hook_article_filter($article) {
$owner_uid = $article["owner_uid"];
$nbs = new NaiveBayesianStorage($owner_uid);
$nb = new NaiveBayesian($nbs);
$categories = $nbs->getCategories();
if (count($categories) > 0) {
$count_neutral = 0;
$count_good = 0;
$id_good = 0;
$id_neutral = 0;
foreach ($categories as $id => $cat) {
if ($cat["category"] == "GOOD") {
$id_good = $id;
$count_good += $cat["word_count"];
} else if ($cat["category"] == "NEUTRAL") {
$id_neutral = $id;
$count_neutral += $cat["word_count"];
}
}
$dst_category = $id_neutral;
$bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));
if ($count_neutral >= 3000 && $count_good >= 1000) {
// enable automatic categorization
$result = $nb->categorize($bayes_content);
if (count($result) == 2) {
$prob_good = $result[$id_good];
$prob_neutral = $result[$id_neutral];
if ($prob_good > 0.90 && $prob_good > $prob_neutral) {
//$dst_category = $id_good; // should we autofile as good or not? idk
$article["score_modifier"] += $this->score_modifier;
}
}
}
$nb->train($article["guid_hashed"], $dst_category, $bayes_content);
$nb->updateProbabilities();
}
return $article;

View File

@ -85,6 +85,7 @@
reset($tokens);
while (list($token, $count) = each($tokens)) {
if ($this->nbs->wordExists($token)) {
$word = $this->nbs->getWord($token, $category);
@ -120,8 +121,10 @@
function train($doc_id, $category_id, $content) {
$ret = false;
// if this doc_id already trained, no trained
if (!$this->nbs->getReference($doc_id)) {
if (!$this->nbs->getReference($doc_id, false)) {
$tokens = $this->_getTokens($content);
while (list($token, $count) = each($tokens)) {
@ -149,15 +152,21 @@
*/
function untrain($doc_id) {
    // Undo a previous training of $doc_id: subtract its token counts from the
    // category it was filed under and drop the stored reference.
    // Returns true when something was untrained, false when there was nothing to undo.
    $ref = $this->nbs->getReference($doc_id);

    // Without a 'content' entry there is no text to tokenise, so the word
    // counts cannot be reverted and the reference is left untouched.
    if (!isset($ref['content'])) {
        return false;
    }

    // _getTokens() yields token => occurrence count; remove exactly what
    // training that text would have added.
    foreach ($this->_getTokens($ref['content']) as $token => $count) {
        $this->nbs->removeWord($token, $count, $ref['category_id']);
    }

    $this->nbs->removeReference($doc_id);

    return true;
}
/** rescale the results between 0 and 1.
@ -226,18 +235,18 @@
function _getTokens($string) {
$rawtokens = array();
$tokens = array();
$string = $this->_cleanString($string);
//$string = $this->_cleanString($string);
if (count(0 >= $this->ignore_list)) {
$this->ignore_list = $this->getIgnoreList();
}
$rawtokens = split("[^-_A-Za-z0-9]+", $string);
$rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);
// remove some tokens
while (list(, $token) = each($rawtokens)) {
$token = trim($token);
if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
$tokens[$token]++;
}
}

View File

@ -61,25 +61,36 @@
*/
function getCategories() {
    // Returns the current owner's categories keyed by category row id; each
    // value carries 'probability', 'category' (the name) and 'word_count'.
    // An empty array is returned when the owner has no categories.
    $categories = array();

    // Cast keeps the concatenated owner id strictly numeric in the SQL text.
    $rs = $this->con->query('SELECT * FROM ttrss_plugin_af_sort_bayes_categories WHERE owner_uid = ' .
        (int) $this->owner_uid);

    while ($line = $this->con->fetch_assoc($rs)) {
        $categories[$line['id']] = array(
            'probability' => $line['probability'],
            'category' => $line['category'],
            'word_count' => $line['word_count'],
        );
    }

    return $categories;
}
/** look up the id of one of the current owner's categories by its name.
@return mixed the category id as returned by fetch_result, or false when the owner has no category with that name
@param string category name (e.g. "GOOD" or "NEUTRAL")
*/
function getCategoryByName($category) {
    // The name is escaped and the owner id is cast to int before either is
    // concatenated into the statement.
    $rs = $this->con->query("SELECT id FROM ttrss_plugin_af_sort_bayes_categories WHERE category = '" .
        $this->con->escape_string($category) . "' AND owner_uid = " . (int) $this->owner_uid);

    if ($this->con->num_rows($rs) == 0) {
        return false;
    }

    return $this->con->fetch_result($rs, 0, "id");
}
/** see if the word is an already learnt word for the current owner.
@return bool true when at least one word-frequency row exists for this word and owner
@param string word
*/
function wordExists($word) {
    // Single owner-scoped lookup; the word is escaped and the owner id is
    // cast to int before being concatenated into the statement.
    $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" .
        $this->con->escape_string($word) . "' AND owner_uid = " . (int) $this->owner_uid);

    return $this->con->num_rows($rs) != 0;
}
@ -92,13 +103,13 @@
function getWord($word, $category_id) {
$details = array();
$rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND category_id='" . $this->con->escape_string($category_id) . "'");
$rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" .
$this->con->escape_string($word) . "' AND category_id=" . (int)$category_id);
if ($this->con->num_rows($rs) == 0 ) {
$details['count'] = 0;
}
else {
$details['count'] = $rs['count'];
} else {
$details['count'] = $this->con->fetch_result($rs, 0, "count");
}
return $details;
@ -116,10 +127,14 @@
$oldword = $this->getWord($word, $category_id);
if (0 == $oldword['count']) {
return $this->con->execute("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count) VALUES ('" . $this->con->escape_string($word) . "', '" . $this->con->escape_string($category_id) . "', '" . $this->con->escape_string((int) $count) . "')");
return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count, owner_uid)
VALUES ('" . $this->con->escape_string($word) . "', '" .
(int)$category_id . "', '" .
(int)$count . "', '".
$this->owner_uid . "')");
}
else {
return $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'");
return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'");
}
}
@ -134,10 +149,14 @@
$oldword = $this->getWord($word, $category_id);
if (0 != $oldword['count'] && 0 >= ($oldword['count'] - $count)) {
return $this->con->execute("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" . $this->con->escape_string($word) . "' AND category_id='" . $this->con->escape_string($category_id) . "'");
return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word='" .
$this->con->escape_string($word) . "' AND category_id='" .
$this->con->escape_string($category_id) . "'");
}
else {
return $this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " . (int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "' AND word = '" . $this->con->escape_string($word) . "'");
return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " .
(int) $count . " WHERE category_id = '" . $this->con->escape_string($category_id) . "'
AND word = '" . $this->con->escape_string($word) . "'");
}
}
@ -148,26 +167,23 @@
*/
function updateProbabilities() {
// first update the word count of each category
$rs = $this->con->query("SELECT category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE 1 GROUP BY category_id");
$total_words = 0;
$rs = $this->con->query("SELECT SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE owner_uid = ".$this->owner_uid);
while ($this->con->fetch_assoc($rs)) {
$total_words += $rs['total'];
}
$rs->moveStart();
$total_words = $this->con->fetch_result($rs, 0, "total");
if ($total_words == 0) {
$this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE 1");
$this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE owner_uid = " . $this->owner_uid);
return true;
}
while ($this->con->fetch_assoc($rs)) {
$proba = $rs['total'] / $total_words;
$this->con->execute("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=" . (int) $rs['total'] . ", probability=" . $proba . " WHERE category_id = '" . $rs['category_id'] . "'");
$rs = $this->con->query("SELECT tc.id AS category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_categories AS tc
LEFT JOIN ttrss_plugin_af_sort_bayes_wordfreqs AS tw ON (tc.id = tw.category_id) WHERE tc.owner_uid = ".$this->owner_uid." GROUP BY tc.id");
while ($line = $this->con->fetch_assoc($rs)) {
$proba = (int)$line['total'] / $total_words;
$this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=" . (int) $line['total'] .
", probability=" . $proba . " WHERE id = '" . $line['category_id'] . "'");
}
return true;
@ -181,8 +197,10 @@
@param string content of the reference
*/
function saveReference($doc_id, $category_id, $content) {
    // Records that document $doc_id was trained into $category_id for the
    // current owner and returns whatever the query call returns.
    // $content is accepted for interface compatibility but is not stored by
    // this statement.
    // NOTE(review): the CREATE TABLE for ttrss_plugin_af_sort_bayes_references
    // in init_database appears to keep "content text NOT NULL" with no default;
    // this INSERT omits that column — confirm the schema accepts it.
    return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_references (document_id, category_id, owner_uid) VALUES
        ('" . $this->con->escape_string($doc_id) . "', '" .
        (int) $category_id . "', " .
        (int) $this->owner_uid . ")");
}
/** get a reference from the database.
@ -190,17 +208,29 @@
@return array reference( category_id => ...., content => ....)
@param string id
*/
function getReference($doc_id, $include_content = true)
{
    // Looks up the current owner's training reference for $doc_id.
    // Returns an empty array when no reference row exists; otherwise an array
    // with 'category_id', 'id' and 'document_id', plus 'content' when
    // $include_content is true and a matching ttrss_entries row is found.
    $ref = array();

    $rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" .
        $this->con->escape_string($doc_id) . "' AND owner_uid = " . (int) $this->owner_uid);

    if ($this->con->num_rows($rs) == 0) {
        return $ref;
    }

    $ref['category_id'] = $this->con->fetch_result($rs, 0, 'category_id');
    $ref['id'] = $this->con->fetch_result($rs, 0, 'id');
    $ref['document_id'] = $this->con->fetch_result($rs, 0, 'document_id');

    if ($include_content) {
        // The text is rebuilt from the article itself (matched by guid) rather
        // than read from the reference row.
        $rs = $this->con->query("SELECT content, title FROM ttrss_entries WHERE guid = '" .
            $this->con->escape_string($ref['document_id']) . "'");

        // 'content' stays unset when the article row is not found.
        if ($this->con->num_rows($rs) != 0) {
            $ref['content'] = mb_strtolower($this->con->fetch_result($rs, 0, 'title') . ' ' .
                strip_tags($this->con->fetch_result($rs, 0, 'content')));
        }
    }

    return $ref;
}
@ -212,7 +242,7 @@
*/
function removeReference($doc_id) {
    // Deletes the current owner's reference row(s) for $doc_id and returns
    // whatever the query call returns. The document id is escaped and the
    // owner id is cast to int before being concatenated into the statement.
    return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" .
        $this->con->escape_string($doc_id) . "' AND owner_uid = " . (int) $this->owner_uid);
}
}