TRUE,
'doctype' => "omit",
'drop-empty-paras' => TRUE,
'drop-font-tags' => TRUE,
'drop-proprietary-attributes' => TRUE,
'fix-backslash' => TRUE,
'hide-comments' => TRUE,
'fix-backslash' => TRUE,
'fix-uri' => TRUE,
'logical-emphasis' => TRUE,
'numeric-entities' => TRUE, // I don't want html entities!
'output-xhtml' => TRUE,
'show-body-only' => TRUE,
'uppercase-attributes' => FALSE,
'uppercase-tags' => FALSE,
// Pretty print stuff. Not really useful, just not to have too big lines.
'wrap' => 200);
//return tidy_repair_string ($badxhtml, $config, 'utf8');
$ret = tidy_repair_string ($badxhtml, $config, 'utf8');
return $ret;
}
else
return false;
// TODO: make a manual (but probably less powerful) method for tidy library is not installed?
//$tidy = new tidy;
//$tidy->parseString($xhtml, $config, 'utf8');
//$tidy->cleanRepair();
} // }}}
// these 2 variables should not be used elsewhere than in "xhtml2xhtmlim" or "xhtml2bare"...
// I could not find any other way to use a common data to all handler functions (for XML parsing)
// than make global variables...
// Is there any nicer workaround?
$xhtmlim = "";
$stack = array ();
// This function transforms $xhtml, which is a normal xhtml content in the corresponding xhtml-im content.
// It supports currently only the core module of XEP-0071.
// cf. http://www.xmpp.org/extensions/xep-0071.html
// This first version does not fix badly html originally + do not remove illegal characters.
// http://openweb.eu.org/articles/xhtml_une_heure/
function xhtml2xhtmlim ($xhtml) // {{{
{
global $xhtmlim;
$xhtml = fixxhtml ($xhtml);
if ($xhtml == false)
// no need to continue if I cannot even check xhtml integrity...
return false;
// From now on, I am supposing the xhtml is compliant.
// Or else it means "tidy" is bugged because I use the tidy package for this.
//$xhtmlim = "";
//$stack = array ();
if (!function_exists ("start_handler"))
{
function start_handler ($parser, $name, $attrs) // {{{
{
global $xhtmlim;
global $stack;
// I don't have to ignore head, html and title elements in the context of this plugin,
// as they are anyway (normally at least) not present in a post content.
// TODO: section 7.2 -> only br, p and span? What about b, em and hX especially?!!
// Section 7.2: br, and p only for now.
if ($name == "br")
{
// no need to push on the stack as "normally" if tidy made well its job, it will close immediately.
// But anyway, no risk to do it...
array_push ($stack, false);
$xhtmlim .= "
";
}
elseif ($name == "p")
{
array_push ($stack, true);
$xhtmlim .= "
";
}
elseif ($name == "strong" || $name == "em" || preg_match ("/^h[1-6]$/", $name) > 0)
{
array_push ($stack, true);
$xhtmlim .= '<' . $name . '>';
}
// Section 7.3: only a with mandatory "href" and recommended "type".
elseif ($name == "a")
{
if (array_key_exists ('href', $attrs))
{
array_push ($stack, true);
$xhtmlim .= '';
else
$xhtmlim .= '">"';
}
else
array_push ($stack, false);
}
// section 7.4: only ol, ul and li recommended (what about "title" and accesskey for accessibility?!).
// And why not def list? This is just done for IM but XMPP is more than just IM.
elseif ($name == "ol" || $name == "ul" || $name == "li")
{
array_push ($stack, true);
$xhtmlim .= '<' . $name . '>';
}
elseif ($name == "img")
{
if (array_key_exists ('src', $attrs) && array_key_exists ('alt', $attrs))
{
array_push ($stack, true);
$xhtmlim .= '';
}
else
array_push ($stack, false);
}
else
array_push ($stack, false);
} // }}}
}
if (!function_exists ("end_handler"))
{
function end_handler ($parser, $name) // {{{
{
global $xhtmlim;
global $stack;
$last_element_has_been_displayed = array_pop ($stack);
if ($last_element_has_been_displayed)
$xhtmlim .= "" . $name . ">";
} // }}}
}
if (!function_exists ("cdata_handler"))
{
function cdata_handler ($parser, $data) // {{{
{
global $xhtmlim;
$xhtmlim .= $data;
} // }}}
}
$xml_parser = xml_parser_create("UTF-8");
xml_parser_set_option ($xml_parser, XML_OPTION_CASE_FOLDING, 0);
xml_set_element_handler ($xml_parser, "start_handler", "end_handler");
xml_set_character_data_handler ($xml_parser, "cdata_handler");
$parse_status = xml_parse ($xml_parser, "$xhtml", TRUE);
xml_parser_free ($xml_parser);
$ret_value = "" . $xhtmlim . '';
$xhtmlim = "";
$stack = array ();
if ($parse_status == XML_STATUS_ERROR)
return FALSE;
//$xhtmlim = html_entity_decode ($xhtml, ENT_QUOTES, "UTF-8");
// maybe should I use char-encoding and input-encoding options of tidy instead?
// '&' and '<' are the only characters which must be transformed in & and <.
// The rest is utf-8, so I let them in their equivalent utf-8 (by html_entity_decode).
// TODO: test œ
// numeric entities?!
return $ret_value;
} // }}}
function xhtml2bare ($xhtml) // {{{ Todo: shouldn't I rather use again the xml parser?!!
{
global $xhtmlim;
global $stack;
$fixed_html = fixxhtml ($xhtml);
// TODO: there must be a guard to prevent from redefine this, for instance function_exists).
if (!function_exists ("start_bare_handler"))
{
function start_bare_handler ($parser, $name, $attrs) // {{{
{
global $xhtmlim;
global $stack;
if ($name == "br")
{
array_push ($stack, false);
$xhtmlim .= "\n";
}
elseif ($name == "p")
{
$xhtmlim .= "\n\t";
array_push ($stack, false);
}
elseif (preg_match ("/^h[1-6]$/", $name) > 0)
{
$xhtmlim .= "\n";
$value = intval ($name[1]);
for ($i = 0; $i < $value; $i++)
$xhtmlim .= "=";
array_push ($stack, false);
}
elseif ($name == "strong" || $name == "em")
array_push ($stack, false);
elseif ($name == "a")
{
if (array_key_exists ('href', $attrs))
{
array_push ($stack, $attrs['href']); // shouldn't I keep the "href" value and write it at the end?!!
//$xhtmlim .= '';
//else
// $xhtmlim .= '">"';
}
else
array_push ($stack, false);
}
// And why not def list? This is just done for IM but XMPP is more than just IM.
elseif ($name == "ol")
{
array_push ($stack, 1);
$xhtmlim .= "\n";
}
elseif ($name == "ul")
{
array_push ($stack, false);
$xhtmlim .= "\n";
}
elseif ($name == "li")
{
//$xhtmlim .= "-"; // should'nt I differentiate ol from ul?!! #1#
$num = array_pop ($stack);
if ($num == false)
{
$xhtmlim .= "\t⇒ ";
array_push ($stack, false);
array_push ($stack, false);
}
else
{
$xhtmlim .= "\t#" . strval ($num) . "# ";
array_push ($stack, $num + 1);
array_push ($stack, false);
}
}
elseif ($name == "blockquote" || $name == "code")
{
array_push ($stack, false);
$xhtmlim .= "\n«\n";
}
else
array_push ($stack, false);
} // }}}
}
if (!function_exists ("end_bare_handler"))
{
function end_bare_handler ($parser, $name) // {{{
{
global $xhtmlim;
global $stack;
if ($name == "br" || $name == "strong" || $name == "em")
array_pop ($stack);
elseif ($name == "p")
{
$xhtmlim .= "\n";
array_pop ($stack);
}
elseif (preg_match ("/^h[1-6]$/", $name) > 0)
{
array_pop ($stack);
$value = intval ($name[1]);
for ($i = 0; $i < $value; $i++)
$xhtmlim .= "=";
$xhtmlim .= "\n";
}
elseif ($name == "a")
{
$link = array_pop ($stack);
if ($link != false)
$xhtmlim .= " [ $link ] ";
}
elseif ($name == "ol" || $name == "ul" || $name == "li")
{
$link = array_pop ($stack);
$xhtmlim .= "\n";
}
elseif ($name == "blockquote" || $name == "code")
{
$xhtmlim .= "\n»\n";
array_pop ($stack);
}
else
array_pop ($stack);
//$must_go_to_line = array_pop ($stack);
//if ($must_go_to_line == true)
// $xhtmlim .= "\n";
//elseif ($must_go_to_line == false)
// ;
//else
// $xhtmlim .= " [ " . $must_go_to_line . " ] ";
} // }}}
}
if (!function_exists ("cdata_bare_handler"))
{
function cdata_bare_handler ($parser, $data) // {{{
{
global $xhtmlim;
$xhtmlim .= $data;
} // }}}
}
if ($fixed_html != false)
{
$xml_parser = xml_parser_create("UTF-8");
xml_set_element_handler ($xml_parser, "start_bare_handler", "end_bare_handler");
xml_set_character_data_handler ($xml_parser, "cdata_bare_handler");
xml_parser_set_option ($xml_parser, XML_OPTION_CASE_FOLDING, 0);
$parse_status = xml_parse ($xml_parser, "$xhtml", TRUE);
xml_parser_free ($xml_parser);
$pattern[0] = '/\n(\s*\n)+/';
$replacement[0] = "\n\n"; // no more than 2 white lines and ending white spaces (\t for