Source of file Analysis.php
Size: 9,119 Bytes - Last Modified: 2017-03-18T21:39:36+01:00
C:/Users/Mike/Desktop/NewsScraper/Article_Analytics/Analysis.php
<?php

/**
 * Labels for link-quality scores, keyed by integer score.
 * overallAnalytics() reads this through `global`, so it must stay a global variable.
 */
$qualityDefinition = [
    -1 => "no links",
    0  => "link",
    1  => "newspaper source",
    2  => "assumed primary source",
];

/**
 * Divides $numerator by $denominator, returning $default when the denominator is zero.
 *
 * @param int|float $numerator
 * @param int|float $denominator
 * @param int|float $default value returned instead of dividing by zero
 * @return int|float
 */
function analysisRatio($numerator, $denominator, $default = 0)
{
    return $denominator == 0 ? $default : $numerator / $denominator;
}

/**
 * Escapes a value for use in HTML text or inside a quoted HTML attribute.
 *
 * @param mixed $value
 * @return string
 */
function analysisEscape($value): string
{
    // ENT_QUOTES is required because the markup below uses single-quoted attributes.
    return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Counts the entries returned by getArticleSources() and sums their "Quality" values.
 *
 * @param \Article $article
 * @return array two-element list: [number of links, summed link quality]
 */
function analysisLinkTotals(\Article $article): array
{
    $numLinks = 0;
    $qualitySum = 0;
    foreach ($article->getArticleSources() as $link) {
        $numLinks += 1;
        $qualitySum += $link["Quality"];
    }

    return [$numLinks, $qualitySum];
}

/**
 * Prints one table row per article: the headline (linked to the article URL) with its
 * author, followed by the statistics computed by articleAnalytics().
 *
 * Headline, author and URL are HTML-escaped before being printed.
 *
 * @param \Article[] $articles
 */
function analyticsByHeadline(array $articles)
{
    foreach ($articles as $a) {
        /* @var $a \Article */
        $analytics = articleAnalytics($a);
        $url = (string) $a->getArticleURL();
        $author = (string) $a->getAuthor();
        // rawurlencode() output contains no quote or angle-bracket characters,
        // so it can be placed in the attributes below without further escaping.
        $sourcesHref = "sources?url=" . rawurlencode($url);

        print "<tr>";
        print "<td><a target='_blank' href='" . analysisEscape($url) . "'>"
            . analysisEscape($a->getHeadline()) . "</a>";
        print "<br/><a href='author?author=" . rawurlencode($author) . "'>By: "
            . analysisEscape($author) . "</a></td>";
        print "<td>" . number_format($analytics["originalReportingPercent"], 2) . "%</td>";
        print "<td onclick='return navigate(event, \"" . $sourcesHref . "\");'> <a href='"
            . $sourcesHref . "'>" . $analytics["numLinks"] . "</a></td>";
        print "<td>" . number_format($analytics["linksPerMWords"], 2) . "</td>";
        print "<td>" . number_format($analytics["averageLinkQuality"], 2) . "</td>";
        print "<td>" . number_format($analytics["numSources"]) . "</td>";
        print "<td>" . number_format($analytics["sourcesPerMWords"], 2) . "</td>";
        print "<td>" . number_format($analytics["numWords"]) . "</td>";
        print "<td>" . number_format($analytics["numSentences"]) . "</td>";
        print "</tr>";
    }
}

/**
 * Returns statistics for a single article: link and source counts, word and sentence
 * counts, links/sources per 1000 words, average link quality, and the per-sentence
 * classification counts together with their percentages.
 *
 * Ratios whose denominator is zero are reported as 0; the average link quality is -1
 * when the article has no links.
 *
 * @param \Article $article article you want statistics about
 * @return array statistics keyed by name (note: the unsourced count uses the key "unsource")
 */
function articleAnalytics(\Article $article): array
{
    $numWords = str_word_count($article->getArticleText());
    $numSources = count($article->getTextSources());
    $numSentences = $article->getNumSentences();

    $unsourced = $article->getNumNotSourced();
    $originalReporting = $article->getNumOriginalReporting();
    $primarySource = $article->getNumPrimarySource();
    $secondarySource = $article->getNumSecondarySource();
    $quote = $article->getNumQuote();
    $needsSource = $article->getNumNeedsSource();

    list($numLinks, $cumulativeLinkQuality) = analysisLinkTotals($article);

    // Guarded like overallAnalytics(): an article with no words must not divide by zero.
    $thousandWords = $numWords / 1000;

    return [
        "numLinks" => $numLinks,
        "numWords" => $numWords,
        "linksPerMWords" => analysisRatio($numLinks, $thousandWords),
        "numSources" => $numSources,
        "sourcesPerMWords" => analysisRatio($numSources, $thousandWords),
        "numSentences" => $numSentences,
        "averageLinkQuality" => analysisRatio($cumulativeLinkQuality, $numLinks, -1),
        // Key spelling "unsource" is kept as-is so existing callers keep working.
        "unsource" => $unsourced,
        "originalReporting" => $originalReporting,
        "primarySource" => $primarySource,
        "secondarySource" => $secondarySource,
        "quote" => $quote,
        "needsSource" => $needsSource,
        "originalReportingPercent" => analysisRatio($originalReporting, $numSentences) * 100,
        "unsourcedPercent" => analysisRatio($unsourced, $numSentences) * 100,
        "primarySourcePercent" => analysisRatio($primarySource, $numSentences) * 100,
        "secondarySourcePercent" => analysisRatio($secondarySource, $numSentences) * 100,
        "quotePercent" => analysisRatio($quote, $numSentences) * 100,
        "needsSourcePercent" => analysisRatio($needsSource, $numSentences) * 100,
    ];
}

/**
 * Aggregates analytics over a list of articles and prints them.
 *
 * Output format depends on the number of articles:
 *  - more than one: a single <p> with <br/>-separated averages;
 *  - exactly one:   one <h1 class='lead'> per figure;
 *  - none:          a single "No Articles to Analyze" heading.
 *
 * @param \Article[] $articles
 */
function overallAnalytics(array $articles)
{
    global $qualityDefinition;

    $numArticles = count($articles);
    if ($numArticles === 0) {
        print "<h1 class='lead'>No Articles to Analyze</h1>";
        return;
    }

    $numLinks = 0;
    $numSources = 0;
    $numWords = 0;
    $cumulativeGradeLevel = 0;
    $cumulativeLinkQuality = 0;
    $cumulativeUnsourced = 0;
    $cumulativeOriginalReporting = 0;
    $cumulativePrimarySource = 0;
    $cumulativeSecondarySource = 0;
    $cumulativeQuote = 0;
    $cumulativeNeedsSource = 0;
    $cumulativeSentenceCount = 0;

    foreach ($articles as $a) {
        /* @var $a \Article */
        $numSources += count($a->getTextSources());
        $numWords += str_word_count($a->getArticleText());
        $cumulativeGradeLevel += $a->getGradeLevel();

        list($links, $linkQuality) = analysisLinkTotals($a);
        $numLinks += $links;
        $cumulativeLinkQuality += $linkQuality;

        $cumulativeSentenceCount += $a->getNumSentences();
        $cumulativeUnsourced += $a->getNumNotSourced();
        $cumulativeOriginalReporting += $a->getNumOriginalReporting();
        $cumulativePrimarySource += $a->getNumPrimarySource();
        $cumulativeSecondarySource += $a->getNumSecondarySource();
        $cumulativeQuote += $a->getNumQuote();
        $cumulativeNeedsSource += $a->getNumNeedsSource();
    }

    // Every ratio goes through analysisRatio() so a zero denominator yields the default
    // instead of a divide-by-zero error.
    $thousandWords = $numWords / 1000;
    $linksPerArticle = analysisRatio($numLinks, $numArticles);
    $linksPerMWords = analysisRatio($numLinks, $thousandWords);
    $sourcesPerArticle = analysisRatio($numSources, $numArticles);
    $sourcesPerMWords = analysisRatio($numSources, $thousandWords);
    $wordsPerArticle = analysisRatio($numWords, $numArticles);
    $sentencesPerArticle = analysisRatio($cumulativeSentenceCount, $numArticles);
    $avgGradeLevel = analysisRatio($cumulativeGradeLevel, $numArticles);

    $pctUnsourced = number_format(analysisRatio($cumulativeUnsourced, $cumulativeSentenceCount) * 100, 2);
    $pctOriginalReporting = number_format(analysisRatio($cumulativeOriginalReporting, $cumulativeSentenceCount) * 100, 2);
    $pctPrimarySource = number_format(analysisRatio($cumulativePrimarySource, $cumulativeSentenceCount) * 100, 2);
    $pctSecondarySource = number_format(analysisRatio($cumulativeSecondarySource, $cumulativeSentenceCount) * 100, 2);
    $pctQuote = number_format(analysisRatio($cumulativeQuote, $cumulativeSentenceCount) * 100, 2);
    $pctNeedsSource = number_format(analysisRatio($cumulativeNeedsSource, $cumulativeSentenceCount) * 100, 2);

    // -1 marks "no links at all" and maps to the matching label in $qualityDefinition.
    $linkQualityAverage = analysisRatio($cumulativeLinkQuality, $numLinks, -1);
    // The average is truncated to an integer score to pick a label; fall back to
    // "unknown" rather than raising a notice if the score has no label.
    $quality = $qualityDefinition[intval($linkQualityAverage)] ?? "unknown";

    if ($numArticles > 1) {
        $rows = [
            "Number of Articles: " . number_format($numArticles),
            "Average Percent Unsourced: " . $pctUnsourced . "%",
            "Average Percent Original Reporting: " . $pctOriginalReporting . "%",
            "Average Percent Primary Source: " . $pctPrimarySource . "%",
            "Average Percent Secondary Source: " . $pctSecondarySource . "%",
            "Average Percent Quote: " . $pctQuote . "%",
            "Average Percent That Needs A Source: " . $pctNeedsSource . "%",
            "Links per Article: " . number_format($linksPerArticle, 2),
            "Links per 1000 Words: " . number_format($linksPerMWords, 2),
            "Link Quality: " . number_format($linkQualityAverage, 2) . " ($quality)",
            "Sources in Text per Article: " . number_format($sourcesPerArticle, 2),
            "Sources in Text per 1000 Words: " . number_format($sourcesPerMWords, 2),
            "Average Word Count: " . number_format($wordsPerArticle, 0),
            "Average Sentence Count: " . number_format($sentencesPerArticle, 0),
            "Average Flesch-Kincaid Grade Level: " . number_format($avgGradeLevel, 1),
        ];
        print "<p>" . implode("<br/>", $rows) . "</p>";
        return;
    }

    // Exactly one article: per-article "averages" are that article's own figures.
    $rows = [
        "Number of Articles: " . number_format($numArticles),
        "Percent Unsourced: " . $pctUnsourced . "%",
        "Percent Original Reporting: " . $pctOriginalReporting . "%",
        "Percent Primary Source: " . $pctPrimarySource . "%",
        "Percent Secondary Source: " . $pctSecondarySource . "%",
        "Percent Quote: " . $pctQuote . "%",
        "Percent That Needs A Source: " . $pctNeedsSource . "%",
        "Links: " . number_format($linksPerArticle),
        "Links per 1000 words: " . number_format($linksPerMWords, 2),
        "Link Quality: " . number_format($linkQualityAverage, 2) . " ($quality)",
        "Sources in Text per Article: " . number_format($sourcesPerArticle),
        // Two decimals, matching every other per-1000-words figure in this file.
        "Sources in Text per 1000 Words: " . number_format($sourcesPerMWords, 2),
        "Word count: " . number_format($wordsPerArticle),
        "Sentence Count: " . number_format($sentencesPerArticle),
        "Flesch-Kincaid Grade Level: " . number_format($avgGradeLevel, 1),
    ];
    foreach ($rows as $row) {
        print "<h1 class='lead'>" . $row . "</h1>";
    }
}