/*
-----------------------------------------------------
| This file takes an array containing information   |
| from an RSS feed and returns a list of keywords   |
| found in the headlines.			    |
| The array provided should be formatted as 	    |
| documented in the rssToArray.txt file.	    |
|						    |
| The content analysis is very simple: a file 	    |
| words.txt contains a list of common words, one    |
| per line, in uppercase. Each word in each headline|
| is checked against this list and, if found, is    |
| stripped.					    |
|						    |
| I use the 150 most common words according to the  |
| FLOB (Freiburg update to the Lancaster-Oslo-Bergen|
| Corpus of British English) for this purpose.  Due |
| to licensing restrictions, I cannot make the list |
| publicly available, so you will have to create    |
| your own.					    |
|						    |
| words.txt is prerequisite for this page to work   |
|___________________________________________________|
| This function is (c) Dominic Smith, 2005 	    |
| Enquiries to dom@domsmith.co.uk		    |
| Released under the General Public Licence (GPL)   |
| See: http://www.gnu.org/licenses/gpl.txt	    |
-----------------------------------------------------


    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

*/

/**
 * Extract the keywords from each headline of a parsed RSS feed.
 *
 * Every headline is split on whitespace; each token has its non-word
 * characters removed and is upper-cased. A token is kept as a keyword unless
 * it is empty, consists only of digits (a bare number), or appears in the
 * common-word list read from "words.txt" in the current working directory
 * (one upper-case word per line).
 *
 * @param array $rss_array Feed data; the headline of entry $i is read from
 *                         $rss_array[$i][0]. Entry 0 is skipped (presumably
 *                         it holds feed-level information - see
 *                         rssToArray.txt to confirm).
 *
 * @return array Map of entry index (starting at 1) to the list of upper-case
 *               keywords found in that entry's headline, in headline order.
 *               Duplicates within a headline are preserved.
 *
 * Terminates the script with "Can't open wordlist" if words.txt cannot be
 * opened.
 */
function getHeadlineKWs($rss_array)

{

// The mode must be a quoted string: a bare r is an undefined constant, which
// is a fatal Error on PHP 8 (and only worked by accident on older versions).
$readwords = fopen("words.txt", "r") or die("Can't open wordlist");

// Store the common words as array keys so each lookup is an exact-match
// isset() rather than a loosely-compared linear in_array() scan.
$commonWords = array();
// Loop on the fgets() result rather than feof(): the feof() form performs one
// extra read at end of file and would add a spurious empty "word".
while (($line = fgets($readwords, 4096)) !== false)
	{
	$word = trim($line);
	if ($word !== '')
		{
		$commonWords[$word] = true;
		}
	}
// The list is fully in memory now, so release the handle straight away.
fclose($readwords);

$headlineKW = array();
$entryCount = count($rss_array);

// Entry 0 is deliberately skipped; headlines start at index 1.
for ($i = 1; $i < $entryCount; $i++)
	{
	$headlineKW[$i] = array();

	// A missing headline is treated as empty instead of raising a warning.
	$headline = isset($rss_array[$i][0]) ? (string) $rss_array[$i][0] : '';

	// Split on any run of whitespace (spaces, tabs, newlines) so adjacent
	// words can never be glued together by the clean-up below.
	$tokens = preg_split('/\s+/', $headline, -1, PREG_SPLIT_NO_EMPTY);

	foreach ($tokens as $token)
		{
		// Get rid of any non-alphanumeric characters (punctuation, quotes, ...).
		$testWord = strtoupper(preg_replace('/\W+/', '', $token));

		// Nothing left after clean-up (e.g. a lone "-"): not a keyword.
		if ($testWord === '')
			{
			continue;
			}

		// Get rid of bare numbers ("10", "2005", "1,000") wherever they occur
		// in the headline, but keep alphanumeric terms such as "G8" or "5BN".
		if (preg_match('/^\d+$/', $testWord))
			{
			continue;
			}

		if (!isset($commonWords[$testWord]))
			{
			$headlineKW[$i][] = $testWord;
			}
		}
	}

return $headlineKW;

}