Source of file Database.php

Size: 8,563 Bytes - Last Modified: 2017-03-18T21:13:43+01:00

C:/Users/Mike/Desktop/NewsScraper/Article_Analytics/Database.php

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
<?php

require_once "Article.php";

/**
 * @author Michael Dombrowski
 */
class Database{
	private $columns = "`ArticleURL`, `Headline`, `Subtitle`, `Author`, `Publisher`, `PublishDate`, `ArticleSources`,
		`TextSources`, `RetrievalDate`, `ArticleSection`, `GradeLevel`, `IsPrimarySource`, `HasUpdates`, `HasNotes`,
		`ArticleText`";

	/** @var  \PDO */
	private $pdo;

	/** @var  \Article */
	private $articles;

	private $fragments = [];

	/**
	 * @param \PDO $pdo
	 */
	public function __construct(\PDO $pdo){
		$this->pdo = $pdo;
		$this->articles = [];
	}

	/**
	 * @return array|Article
	 */
	public function getArticles(): array {
		return $this->articles;
	}

	/**
	 * Returns \Article object with the given URL
	 * @param string $url
	 * @return \Article
	 */
	public function getArticleByURL(string $url): \Article {
		$q = $this->pdo->prepare("SELECT * from `articles` WHERE `ArticleURL`=:url");
		$q->bindValue(":url", $url, PDO::PARAM_STR);
		$q->execute();
		return $this->makeArticleFromDB($q->fetch(PDO::FETCH_ASSOC));
	}

	/**
	 * Returns array of \Article objects that have the same publisher as the given $publisher
	 * @param string $publisher
	 * @return array|Article
	 */
	public function getArticlesByPublisher(string $publisher): array {
		$selected = [];
		$q = $this->pdo->prepare("SELECT ".$this->columns." from `articles` WHERE `Publisher`=:publisher");
		$q->bindValue(":publisher", $publisher, PDO::PARAM_STR);
		$q->execute();
		foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
			$selected[] = $this->makeArticleFromDB($a);
		}
		return $selected;
	}

	/**
	 * Returns array of \Article objects that have the same author as the given $author
	 * @param string $author
	 * @return array|Article
	 */
	public function getArticlesByAuthor(string $author): array {
		$selected = [];
		$q = $this->pdo->prepare("SELECT ".$this->columns." from `articles` WHERE `Author`=:author");
		$q->bindValue(":author", $author, PDO::PARAM_STR);
		$q->execute();
		foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
			$selected[] = $this->makeArticleFromDB($a);
		}
		return $selected;
	}

	/**
	 * Returns array of \Article objects that have the same section as the given $section
	 * @param string $section
	 * @return array|Article
	 */
	public function getArticlesBySection(string $section): array {
		$selected = [];
		$q = $this->pdo->prepare("SELECT ".$this->columns." from `articles` WHERE `ArticleSection` LIKE :section");
		$q->bindValue(":section", "%\"".$section."\"%", PDO::PARAM_STR);
		$q->execute();
		foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
			$sections = json_decode($a["ArticleSection"], true);
			foreach($sections as $s){
				$article = $this->makeArticleFromDB($a);
				if(mb_strtolower($section) == mb_strtolower($s) && !in_array($article, $selected, true)){
					$selected[] = $article;
				}
			}
		}
		return $selected;
	}

	/**
	 * Returns array of \Article objects that were published on a given day of the week
	 * @param string $day Day of the week
	 * @return array|Article
	 */
	public function getArticlesByDayOfWeek(string $day): array {
		$selected = [];
		foreach($this->articles as $a){
			/* @var $a \Article */
			if(date_format($a->getPublishDate(), "l") == $day){
				$selected[] = $a;
			}
		}
		return $selected;
	}

	/**
	 * Returns a list of all the sections any article in the database was published in
	 * @return array|string
	 */
	public function listAllSections(): array {
		$sections = [];
		$q = $this->pdo->query("SELECT `ArticleSection` from `articles` GROUP BY `ArticleSection`");
		foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
			$section = json_decode($a["ArticleSection"], true);
			foreach($section as $s){
				/* @var $s string */
				if(!in_array(mb_strtolower($s), $sections, true)){
					$sections[] = mb_strtolower($s);
				}
			}
		}
		sort($sections);
		return $sections;
	}

	/**
	 * Returns a list of all the authors any article in the database wrote
	 * @return array|string
	 */
	public function listAllAuthors(): array {
		$authors = [];
		$q = $this->pdo->query("SELECT `Author` from `articles` GROUP BY `Author`");
		foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
			if(!in_array($a["Author"], $authors, true)){
				$authors[] = $a["Author"];
			}
		}
		sort($authors);
		return $authors;
	}

	/**
	 * Returns a list of all the publishers in the database
	 * @return array|string
	 */
	public function listAllPublishers(): array {
		$publishers = [];
		$q = $this->pdo->query("SELECT `Publisher` from `articles` GROUP BY `Publisher`");
		foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
			if(!in_array($a["Publisher"], $publishers, true)){
				$publishers[] = $a["Publisher"];
			}
		}
		sort($publishers);
		return $publishers;
	}

	/**
	 * Reads database of given \PDO and creates \Article objects for each row
	 */
	public function readDatabase() {
		$q = $this->pdo->query("SELECT count(*) FROM `Articles`");
		$count = $q->fetch(PDO::FETCH_ASSOC)["count(*)"];
		$numFetched = 0;
		$start = 0;
		// Fetch only 50 articles at a time to limit the memory impact
		while($numFetched < $count){
			$q = $this->pdo->query("SELECT ".$this->columns." FROM `Articles` LIMIT $start,100");
			foreach($q->fetchAll(PDO::FETCH_ASSOC) as $a){
				$this->articles[] = $this->makeArticleFromDB($a);
				$numFetched += 1;
			}
			$start += 100;
		}
	}

	private function getFragments(){
		$q = $this->pdo->prepare("SELECT `ArticleURL`, `IsSource`, `Guess`, count(*) FROM `Fragments-Table` 
			GROUP BY `ArticleURL`, `IsSource`, `Guess`");
		$q->execute();
		$rows = $q->fetchAll(PDO::FETCH_ASSOC);

		// Index fragments by ArticleURL for faster access times
		foreach($rows as $r){
			if(!isset($this->fragments[$r["ArticleURL"]])){
				$this->fragments[$r["ArticleURL"]] = [];
			}
			$this->fragments[$r["ArticleURL"]][] = ["IsSource"=>$r["IsSource"], "Guess"=>$r["Guess"], "count(*)"=>$r["count(*)"]];
		}
	}

	private function setClassificationData(\Article $article){
		if(count($this->fragments) <= 0){
			$this->getFragments();
		}

		foreach($this->fragments[$article->getArticleURL()] as $r){
			$article->setNumSentences($article->getNumSentences() + $r["count(*)"]);
			if($r["IsSource"] != -1){
				$article = $this->setClassification($article, $r["IsSource"], $r["count(*)"]);
			}
			else if($r["Guess"] != null){
				$article = $this->setClassification($article, $r["Guess"], $r["count(*)"]);
			}
		}

		return $article;
	}

	private function setClassification(\Article $article, $value, $num){
		switch($value){
			case 0:
				$article->setNumNotSourced($article->getNumNotSourced() + $num);
				break;
			case 1:
				$article->setNumOriginalReporting($article->getNumOriginalReporting() + $num);
				break;
			case 2:
				$article->setNumPrimarySource($article->getNumPrimarySource() + $num);
				break;
			case 3:
				$article->setNumSecondarySource($article->getNumSecondarySource() + $num);
				break;
			case 4:
				$article->setNumQuote($article->getNumQuote() + $num);
				break;
			case 5:
				$article->setNumNeedsSource($article->getNumNeedsSource() + $num);
				break;
			default:
				break;
		}
		return $article;
	}

	private function makeArticleFromDB($a){
		$article = new Article();

		$pDate = new DateTime();
		$pDate->setTimestamp(strtotime($a["PublishDate"]));
		$fDate = new DateTime();
		$fDate->setTimestamp(strtotime($a["RetrievalDate"]));

		$article->setArticleURL($a["ArticleURL"]);
		$article->setHeadline($a["Headline"]);
		$article->setSubtitle($a["Subtitle"]);
		$article->setAuthor($a["Author"]);
		$article->setPublisher($a["Publisher"]);
		$article->setPublishDate($pDate);
		if(isset($a["ArticleText"])){
			$article->setArticleText($a["ArticleText"]);
		}
		if(isset($a["ArticleHTML"])){
			$article->setArticleHTML($a["ArticleHTML"]);
		}
		$article->setArticleSources(json_decode($a["ArticleSources"], true));
		$article->setTextSources(json_decode($a["TextSources"], true));
		$article->setRetrievalDate($fDate);
		$article->setArticleSection(json_decode($a["ArticleSection"], true));
		$article->setGradeLevel($a["GradeLevel"]);
		$article->setIsPrimarySource($a["IsPrimarySource"]);
		$article->setHasUpdates($a["HasUpdates"]);
		$article->setHasNotes($a["HasNotes"]);

		$article = $this->setClassificationData($article);
		return $article;
	}
}