/* -----------------------------------------------------
  | This file takes an array containing information   |
  | from an RSS feed and returns a list of keywords   |
  | found in the headlines.                           |
  | The array provided should be formatted as         |
  | documented in the rssToArray.txt file.            |
  |                                                   |
  | The content analysis is very simple: a file       |
  | words.txt contains a list of common words, one    |
  | per line, in uppercase. Each word in each headline|
  | is checked against this list and, if found, is    |
  | stripped.                                         |
  |                                                   |
  | I use the 150 most common words according to the  |
  | FLOB (Freiburg update to the Lancaster-Oslo-Bergen|
  | Corpus of British English) for this purpose. Due  |
  | to licensing restrictions, I cannot make the list |
  | publicly available, so you will have to create    |
  | your own.                                         |
  |                                                   |
  | words.txt is a prerequisite for this page to work |
  |___________________________________________________|
  | This function is (c) Dominic Smith, 2005          |
  | Enquiries to dom@domsmith.co.uk                   |
  | Released under the General Public Licence (GPL)   |
  | See: http://www.gnu.org/licenses/gpl.txt          |
   -----------------------------------------------------

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/**
 * Extract the keywords of every headline in a parsed RSS feed.
 *
 * A keyword is any headline word that, once stripped of non-word
 * characters and upper-cased, is not listed in words.txt (one common
 * word per line, read from the current working directory).
 *
 * @param array $rss_array Parsed feed. Element 0 is skipped; for every
 *                         later element $i, $rss_array[$i][0] is read as
 *                         the headline text. (Presumably element 0 holds
 *                         feed-level data as described in rssToArray.txt;
 *                         confirm against that file.)
 *
 * @return array Map of item index (1 .. count-1) to the list of
 *               upper-cased keywords found in that item's headline.
 *               Terminates the script via die() if words.txt cannot
 *               be opened.
 */
function getHeadlineKWs($rss_array)
{
    // The mode must be the string 'r'; a bare r is an undefined constant.
    $wordList = fopen('words.txt', 'r');
    if ($wordList === false) {
        die("Can't open wordlist");
    }

    // Key the common words by value so each lookup below is a single
    // hash probe rather than a scan of the whole list.
    $commonWords = array();
    while (($line = fgets($wordList)) !== false) {
        // Upper-case here so a lower-case word list works as well.
        $word = strtoupper(trim($line));
        if ($word !== '') {
            $commonWords[$word] = true;
        }
    }
    // The list is fully in memory; release the handle straight away.
    fclose($wordList);

    $headlineKW = array();
    $itemCount = count($rss_array);

    for ($i = 1; $i < $itemCount; $i++) {
        $headlineKW[$i] = array();
        $headline = isset($rss_array[$i][0]) ? (string) $rss_array[$i][0] : '';

        // Get rid of any number that starts a word (or the headline), but
        // keep digits preceded by something (eg. 'G8'). The match is
        // replaced by a space so the words either side are never glued
        // together.
        $headline = preg_replace('/(?:^|\s+)\d+/', ' ', $headline);

        // Split on any run of whitespace and discard empty pieces.
        $tokens = preg_split('/\s+/', $headline, -1, PREG_SPLIT_NO_EMPTY);

        foreach ($tokens as $token) {
            // Strip punctuation only now, per token, so that spaces could
            // still act as word separators above.
            // NOTE(review): without the /u modifier, \W presumably also
            // strips the bytes of non-ASCII letters - confirm whether
            // feeds are ASCII-only.
            $keyword = strtoupper(preg_replace('/\W+/', '', $token));

            // Tokens made only of punctuation end up empty; skip them
            // along with the common words.
            if ($keyword === '' || isset($commonWords[$keyword])) {
                continue;
            }

            $headlineKW[$i][] = $keyword;
        }
    }

    return $headlineKW;
}