use ngram tokens instead of whole words for matching

This commit is contained in:
Andrew Dolgov 2015-06-17 18:15:41 +03:00
parent 4947c02e1a
commit 3dcd00e4c4
1 changed file with 5 additions and 4 deletions

View File

@@ -16,6 +16,7 @@ class Af_Sort_Bayes extends Plugin {
function init($host) {
require_once __DIR__ . "/lib/class.naivebayesian.php";
require_once __DIR__ . "/lib/class.naivebayesian_ngram.php";
require_once __DIR__ . "/lib/class.naivebayesianstorage.php";
$this->host = $host;
@@ -36,7 +37,7 @@ class Af_Sort_Bayes extends Plugin {
$category = $train_up ? "GOOD" : "NEUTRAL";
$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
$nb = new NaiveBayesian($nbs);
$nb = new NaiveBayesianNgram($nbs);
$result = $this->dbh->query("SELECT score, guid, title, content FROM ttrss_entries, ttrss_user_entries WHERE ref_id = id AND id = " .
$article_id . " AND owner_uid = " . $_SESSION["uid"]);
@@ -202,7 +203,7 @@ class Af_Sort_Bayes extends Plugin {
$owner_uid = $article["owner_uid"];
$nbs = new NaiveBayesianStorage($owner_uid);
$nb = new NaiveBayesian($nbs);
$nb = new NaiveBayesianNgram($nbs);
$categories = $nbs->getCategories();
@@ -227,7 +228,7 @@ class Af_Sort_Bayes extends Plugin {
$bayes_content = mb_strtolower($article["title"] . " " . strip_tags($article["content"]));
if ($count_neutral >= 3000 && $count_good >= 1000) {
if ($count_neutral >= 20000 && $count_good >= 10000) {
// enable automatic categorization
$result = $nb->categorize($bayes_content);
@@ -261,7 +262,7 @@ class Af_Sort_Bayes extends Plugin {
$this->dbh->query("COMMIT");
$nbs = new NaiveBayesianStorage($_SESSION["uid"]);
$nb = new NaiveBayesian($nbs);
$nb = new NaiveBayesianNgram($nbs);
$nb->updateProbabilities();
}