<?php
function filter( $item )
{
if (strlen($item) <= 3)
return false;
return true;
}
function strcount( $item )
{
GLOBAL $content;
return Array(substr_count($content, $item), $item);
}
function html2txt($document){
$search = array('@<script[^>]*?>.*?</script>@si', // Strip out javascript
'@<[\/\!]*?[^<>]*?>@si', // Strip out HTML tags
'@<style[^>]*?>.*?</style>@siU', // Strip style tags properly
'@<![\s\S]*?--[ \t\n\r]*>@' // Strip multi-line comments including CDATA
);
$text = preg_replace($search, '', $document);
return $text;
}
/**
* Remove HTML tags, including invisible text such as style and
* script code, and embedded objects. Add line breaks around
* block-level tags to prevent word joining after tag removal.
*/
function strip_html_tags( $text )
{
$text = preg_replace(
array(
// Remove invisible content
'@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu',
),
array(
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
"\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",
"\n\$0", "\n\$0",
),
$text );
return strip_tags( $text );
}
function get_keywords($content, $count)
{
$content = strip_html_tags($content);
$content = html_entity_decode($content);
$arr = str_word_count( $content, 1);
$arr = array_filter($arr, 'filter');
$arr = array_unique($arr);
$arr = array_map('strcount', $arr);
arsort($arr);
foreach($arr as $item)
{
if ($i < $count)
$ret_arr[] = $item[1];
$i++;
}
return implode(',', $ret_arr);
}
$content = file_get_contents('http://www.tankado.com/');
echo get_keywords($content, 5);
?>