rework feed content mangling algorithm

This commit is contained in:
Andrew Dolgov 2007-08-21 15:15:50 +01:00
parent 999703d156
commit 2a479dced0
2 changed files with 57 additions and 30 deletions

View File

@ -279,33 +279,6 @@ function _fetch_remote_file ($url, $headers = "" ) {
}
/**
 * Rewrite named HTML 4 Latin-1 character entities as numeric character
 * references, which an XML parser accepts without a DTD.
 *
 * Only complete references are rewritten, i.e. "&name;" including the
 * terminating semicolon. Matching the bare "&name" prefix would also fire on
 * longer entity names that merely start the same way (such as "&notin;") and
 * on unrelated text (such as "&section=1"), and would leave the original
 * semicolon behind after the replacement.
 *
 * @param string $string text that may contain named entities
 * @return string the text with every "&name;" from the table below replaced
 *                by "&#NNN;"; everything else is returned unchanged
 */
function _convert_entities ($string) {
	# Source: http://www.w3.org/TR/REC-html40/sgml/entities.html
	# Names are listed in code point order: the first entry is U+00A0 (160)
	# and the last one is U+00FF (255), so the numeric reference follows
	# directly from the position in the list.
	$entity_names = array(
		"nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy",
		"ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3",
		"acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12",
		"frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
		"Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde",
		"Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc",
		"Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring",
		"aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
		"eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave",
		"uacute", "ucirc", "uuml", "yacute", "thorn", "yuml");

	$first_code_point = 160;
	$translation = array();

	foreach ($entity_names as $offset => $name) {
		$translation["&" . $name . ";"] = "&#" . ($first_code_point + $offset) . ";";
	}

	# strtr() with a map makes a single pass over the input, so replaced
	# text is never scanned again.
	return strtr($string, $translation);
}
/*=======================================================================*\
Function: _response_to_rss
Purpose: parse an HTTP response object into an RSS object
@ -313,8 +286,7 @@ function _convert_entities ($string) {
Output: parsed RSS object (see rss_parse)
\*=======================================================================*/
function _response_to_rss ($resp) {
$converted_source = _convert_entities($resp->results);
$rss = new MagpieRSS( $converted_source, MAGPIE_OUTPUT_ENCODING, "UTF-8", false);
$rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, "UTF-8", false);
// if RSS parsed successfully
if ( $rss and !$rss->ERROR) {

View File

@ -23,6 +23,35 @@
define('RSS', 'RSS');
define('ATOM', 'Atom');
/**
 * Rewrite named HTML 4 Latin-1 character entities as numeric character
 * references, which an XML parser accepts without a DTD.
 *
 * Only complete references are rewritten, i.e. "&name;" including the
 * terminating semicolon. Matching the bare "&name" prefix would also fire on
 * longer entity names that merely start the same way (such as "&notin;") and
 * on unrelated text (such as "&section=1"), and would leave the original
 * semicolon behind after the replacement.
 *
 * @param string $string text that may contain named entities
 * @return string the text with every "&name;" from the table below replaced
 *                by "&#NNN;"; everything else is returned unchanged
 */
function _convert_entities ($string) {
	# Source: http://www.w3.org/TR/REC-html40/sgml/entities.html
	# Names are listed in code point order: the first entry is U+00A0 (160)
	# and the last one is U+00FF (255), so the numeric reference follows
	# directly from the position in the list.
	$entity_names = array(
		"nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy",
		"ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3",
		"acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12",
		"frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
		"Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde",
		"Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc",
		"Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring",
		"aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
		"eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave",
		"uacute", "ucirc", "uuml", "yacute", "thorn", "yuml");

	$first_code_point = 160;
	$translation = array();

	foreach ($entity_names as $offset => $name) {
		$translation["&" . $name . ";"] = "&#" . ($first_code_point + $offset) . ";";
	}

	# strtr() with a map makes a single pass over the input, so replaced
	# text is never scanned again.
	return strtr($string, $translation);
}
require_once (MAGPIE_DIR . 'rss_utils.inc');
/**
@ -149,12 +178,14 @@ class MagpieRSS {
$enc = mb_detect_encoding($string);
}
# try to fix XML, pass 1
$source = mb_convert_encoding($source, "UTF-8", $enc);
list($parser, $source) = $this->create_parser($source,
$output_encoding, $input_encoding, $detect_encoding);
$this->parser = $parser;
$this->parser = $parser;
xml_set_object( $this->parser, $this );
xml_set_element_handler($this->parser,
@ -163,6 +194,30 @@ class MagpieRSS {
xml_set_character_data_handler( $this->parser, 'feed_cdata' );
$status = xml_parse( $this->parser, $source);
# try to fix XML, pass 2
if (! $status) {
$errorcode = xml_get_error_code( $this->parser );
if ( $errorcode != XML_ERROR_NONE ) {
$source = _convert_entities($source);
list($parser, $source) = $this->create_parser($source,
$output_encoding, $input_encoding, $detect_encoding);
$this->parser = $parser;
xml_set_object( $this->parser, $this );
xml_set_element_handler($this->parser,
'feed_start_element', 'feed_end_element' );
xml_set_character_data_handler( $this->parser, 'feed_cdata' );
$status = xml_parse( $this->parser, $source);
}
}
}
}