3 * Copyright © 2004 Reini Urban
5 * This file is part of PhpWiki.
7 * PhpWiki is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * PhpWiki is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with PhpWiki; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 * SPDX-License-Identifier: GPL-2.0-or-later
26 * HtmlParser Class: Conversion HTML => wikimarkup
27 * Requires PhpWikiXmlParser, XmlElement and the expat (or now the libxml) library. This is all in core.
31 * Base class to implement html => wikitext converters,
32 * extendable for various wiki syntax versions.
33 * This is needed to be able to use htmlarea-alike editors,
34 * and to import XML or HTML documents.
36 * See also php-html.sf.net for a php-only version, if
37 * you don't have the expat/libxml extension included.
38 * See also http://search.cpan.org/~diberri/HTML-WikiConverter/
42 // RssParser contains the XML (expat) and url-grabber methods
43 require_once 'lib/PhpWikiXmlParser.php';
46 extends PhpWikiXmlParser
48 public $dialect, $_handlers, $root;
/**
 * Set up the converter: attach the PhpWiki dialect, share its handler
 * table with this object and configure the underlying XML parser.
 *
 * @param mixed  $dialect  Accepted for interface compatibility; this body
 *                         never reads it and always instantiates
 *                         HtmlParser_PhpWiki.
 * @param string $encoding Handed on unchanged to PhpWikiXmlParser().
 */
function __construct($dialect, $encoding = '')
{
    $wikiDialect = new HtmlParser_PhpWiki();
    $this->dialect = $wikiDialect;
    // Bound by reference so both objects keep seeing the same handler table.
    $this->_handlers =& $wikiDialect->_handlers;

    // NOTE(review): PHP4-style initialiser call; presumably the parent class
    // still offers a method of this name -- confirm against PhpWikiXmlParser.
    $this->PhpWikiXmlParser($encoding);

    // Leave tag names as written (no case folding) and let the parser skip
    // values that consist of whitespace only.
    $parserOptions = array(
        XML_OPTION_CASE_FOLDING => 0,
        XML_OPTION_SKIP_WHITE => 1,
    );
    foreach ($parserOptions as $option => $value) {
        xml_parser_set_option($this->_parser, $option, $value);
    }
}
59 // The three callbacks, called on walking through the HTML tree.
60 // No extensions needed from PhpWikiXmlParser.
62 function tag_open($parser, $name, $attrs='') {
64 function tag_close($parser, $name, $attrs='') {
66 function cdata($parser, $data) {
68 function parse_url($file, $debug=false)
73 if (is_null($this->root))
74 $this->root = $GLOBALS['xml_parser_root'];
75 $output = $this->wikify($this->root);
/**
 * Convert one node of the parsed HTML tree into wiki markup.
 *
 * Element nodes are dispatched through the dialect's handler table,
 * keyed by tag name:
 *  - a string naming a dialect method: that method renders the node;
 *  - an array: the converted children are wrapped between its first
 *    and its last element;
 *  - anything else (or no entry at all): only the converted children
 *    are emitted.
 * A node that is not an XmlElement is treated as text: runs of blanks
 * are collapsed unless the parent is a <pre>, and whitespace-only text
 * is dropped.
 *
 * @param HtmlElement|string $node   Element or text to convert.
 * @param HtmlElement|null   $parent Element containing $node, if any.
 * @return string
 */
function wikify($node, $parent = null)
{
    if (!is_a($node, 'XmlElement')) {
        $text = (string) $node;
        if ($parent and $parent->_tag != 'pre') {
            // preg_replace() returns the new string instead of modifying
            // its argument, so the result has to be kept.
            $text = preg_replace("/ {2,}/", " ", $text);
        }
        return trim($text) === '' ? '' : $text;
    }

    $dialect = $this->dialect;
    $tag = $node->_tag;
    // Tags without a registered handler fall through to plain recursion.
    $conv = isset($dialect->_handlers[$tag]) ? $dialect->_handlers[$tag] : null;

    if (is_string($conv) and method_exists($dialect, $conv)) {
        return $dialect->$conv($node);
    }

    $inner = '';
    foreach ($node->getContent() as $child) {
        $inner .= $this->wikify($child, $node);
    }

    if (is_array($conv) and count($conv) > 0) {
        // A one-element array uses the same marker on both sides.
        return $conv[0] . $inner . $conv[count($conv) - 1];
    }
    return $inner;
}
/**
 * Return a wikified version of the contents of an element.
 *
 * Usage: $output = $parser->elem_contents($elem);
 *
 * Each entry of the element's content list is passed through wikify()
 * and the results are concatenated. A value that is not an XmlElement
 * is wikified directly.
 *
 * @param HtmlElement|string $node
 * @return string
 */
function elem_contents($node)
{
    if (!is_a($node, 'XmlElement')) {
        // Wikify the value that was passed in; no other variable holds it.
        return $this->wikify($node);
    }

    // NOTE(review): the children are given the element's own parent as
    // their "parent", whereas wikify() passes the element itself when it
    // recurses -- confirm which of the two is intended.
    $context = isset($node->parent) ? $node->parent : null;

    $output = '';
    foreach ($node->getContent() as $child) {
        $output .= $this->wikify($child, $context);
    }
    return $output;
}
/**
 * Private function: _elem_attr_str( $elem, @attrs )
 *
 * Returns a string of attribute name/value pairs taken from the
 * element's $_attr map. Only attributes whose lower-cased name is
 * listed in $attrs are included. Every pair is rendered as
 * ` name="value"` (with a leading blank), so the result can be placed
 * directly inside an HTML start tag.
 *
 * @param HtmlElement $node  Element whose attributes are read.
 * @param array       $attrs Lower-case attribute names to keep.
 * @return string Empty when the element has no matching attribute.
 */
function _elem_attr_str($node, $attrs)
{
    if (empty($node->_attr)) {
        return '';
    }

    $s = '';
    foreach ($node->_attr as $attr => $val) {
        $attr = strtolower($attr);
        if (in_array($attr, $attrs, true)) {
            // Escape the value: a raw double quote or ampersand would
            // otherwise break the attr="value" form promised above.
            $s .= ' ' . $attr . '="' . htmlspecialchars((string) $val, ENT_COMPAT) . '"';
        }
    }
    return $s;
}
/**
 * Private function: _elem_has_ancestor( $elem, $tagname )
 *
 * Tells whether any element above $node -- reached by following the
 * `parent` links -- carries the tag $tag. The node's own tag is not
 * examined.
 *
 * @param HtmlElement $node
 * @param string      $tag
 * @return bool
 */
function _elem_has_ancestor($node, $tag)
{
    // Climb one parent link per iteration until the chain ends.
    for ($current = $node; isset($current->parent); $current = $current->parent) {
        if ($current->parent->_tag == $tag) {
            return true;
        }
    }
    return false;
}
/**
 * Private function: _elem_is_image_div( $elem )
 *
 * Returns true if $node is a container element (P or DIV) meant only
 * to hold an image: its sole child is an IMG element, or a single A
 * element whose sole child is an IMG element. Children that are
 * whitespace-only text are not counted.
 *
 * @param HtmlElement|null $node
 * @return bool
 */
function _elem_is_image_div($node)
{
    // Not an element at all, or not a DIV/P container.
    if (!is_object($node) or !in_array($node->_tag, array("div", "p"), true)) {
        return false;
    }

    // Yields the only significant child of an element, or null when there
    // is none, more than one, or when that child is text.
    $soleChild = static function ($element) {
        $significant = array();
        foreach ($element->getContent() as $child) {
            if (is_string($child) && trim($child) === '') {
                continue;
            }
            $significant[] = $child;
        }
        if (count($significant) == 1 && is_object($significant[0])) {
            return $significant[0];
        }
        return null;
    };

    $child = $soleChild($node);
    if ($child === null) {
        return false;
    }
    if ($child->_tag == 'img') {
        return true;
    }
    // A lone link that wraps a lone image counts as well.
    if ($child->_tag == 'a') {
        $linked = $soleChild($child);
        return $linked !== null && $linked->_tag == 'img';
    }
    return false;
}
/**
 * Fallback conversion: keeps both the tags and the content of the
 * element by delegating to wikify_preserve().
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_default($node)
{
    $preserved = $this->wikify_preserve($node);

    return $preserved;
}
/**
 * Keeps both the tags and the content of the element: the result is
 * whatever the node's own asXML() serialisation yields.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_preserve($node)
{
    $markup = $node->asXML();

    return $markup;
}
238 class HtmlParser_PhpWiki
// Registers the conversion rule for each HTML tag known to the PhpWiki dialect.
// How wikify() reads an entry: a string that names a method of this object is
// called with the node; an array wraps the converted child content between its
// first and its last element, so a one-element array such as array("*") puts
// the same marker on both sides.
// NOTE(review): presumably these entries populate $this->_handlers -- the
// assignment itself is not visible in this excerpt; confirm.
243 function __construct()
256 'strong' => array("*"),
261 // PRE blocks are handled specially (see tidy_whitespace and
263 'pre' => array("<pre>", "</pre>"),
265 'dl' => array('', "\n\n"),
266 'dt' => array(';', ''),
267 'dd' => array(':', ''),
269 'p' => array("\n\n", "\n\n"),
270 'ul' => array('', "\n"),
271 'ol' => array('', "\n"),
// Method handlers: the named method receives the element node.
273 'li' => "wikify_list_item",
274 'table' => "wikify_table",
278 'div' => array('', "\n\n"),
279 'img' => "wikify_img",
280 'a' => "wikify_link",
// Two empty strings: the tag itself disappears, its content is kept.
281 'span' => array('', ''),
290 'font' => array('', ''),
// wikify_default keeps tag and content as XML (it calls wikify_preserve).
291 'sup' => "wikify_default",
292 'sub' => "wikify_default",
// <nowiki> content is re-emitted inside a <verbatim> block by wikify_verbatim.
293 'nowiki' => "wikify_verbatim",
294 'verbatim' => "wikify_default",
295 'noinclude' => "wikify_noinclude",
/**
 * Render a table: the converted contents of the element are placed
 * between an opening "| " line and a closing "|" line followed by a
 * blank line.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_table($node)
{
    $rows = $this->elem_contents($node);

    return "| \n" . $rows . "|\n\n";
}
/**
 * Render a table row: a newline and "| " followed by the converted
 * contents of the row element.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_tr($node)
{
    $cells = $this->elem_contents($node);

    return "\n| " . $cells;
}
/**
 * Render a table cell as "<ident>| <content> |" followed by a newline.
 *
 * The current indentation prefix is read from $this->ident when that
 * property is set and non-empty; otherwise no prefix is used.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_th($node)
{
    $ident = empty($this->ident) ? '' : $this->ident;

    // Strip leading whitespace from the cell text. The pattern needs real
    // PCRE delimiters (a Perl-style "s/.../" is rejected by PHP), and
    // preg_replace() returns its result rather than editing in place.
    $content = preg_replace('/^\s+/', '', (string) $this->elem_contents($node));

    return $ident . "| " . $content . " |\n";
}
/**
 * Render a list item as "<marker> <content>" followed by a newline.
 *
 * PhpWiki markup uses "#" for numbered and "*" for bulleted items, so
 * the marker is chosen from the nearest enclosing list element:
 * "#" inside an <ol>, "*" otherwise.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_list_item($node)
{
    $marker = '*';
    // Walk up to the closest list container; an outer list of the other
    // kind must not decide the marker of a nested one.
    for ($current = $node; isset($current->parent); $current = $current->parent) {
        $tag = $current->parent->_tag;
        if ($tag == 'ol') {
            $marker = '#';
            break;
        }
        if ($tag == 'ul') {
            break;
        }
    }

    return $marker . " " . trim($this->elem_contents($node)) . "\n";
}
/**
 * Render an anchor element.
 *
 * Only the trimmed link text is returned when the anchor sits directly
 * inside a heading (h1..h6), when it belongs to an image container
 * (see _elem_is_image_div), or when it has no href at all. If the
 * resolved URL equals the link text the bare URL is returned, since the
 * wiki engine turns it into a clickable link by itself. Otherwise the
 * result is "[ url | title ]".
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_link($node)
{
    $title = trim($this->elem_contents($node));

    if (isset($node->parent)) {
        $container = $node->parent;
        // Just the link title inside a heading ...
        if (preg_match('/^h\d$/', $container->_tag)) {
            return $title;
        }
        // ... or when this is a link to an image within an image container.
        if ($this->_elem_is_image_div($container)) {
            return $title;
        }
    }

    // Anchors without a target (e.g. <a name="...">) have nothing to link to.
    $href = $node->getAttr('href');
    if ((string) $href === '') {
        return $title;
    }

    $url = (string) $this->absolute_url($href);
    // Strict comparison: numeric-looking strings must not compare equal
    // merely because they denote the same number.
    if ($url === $title) {
        return $url;
    }
    return "[ $url | $title ]";
}
/**
 * Render a heading element (h1..h6).
 *
 * The digit of the tag name selects the number of "!" markers:
 * h1 => "!!!", h2 => "!!", h3 and anything deeper => "!". The markers
 * are followed by the trimmed heading text and a blank line.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_h($node)
{
    $level = (int) substr($node->_tag, 1);

    // Never ask str_repeat() for fewer than one marker: for h4..h6 the
    // plain difference would be zero or negative.
    $markup = str_repeat('!', max(1, 4 - $level));

    return $markup . ' ' . trim($this->elem_contents($node)) . "\n\n";
}
/**
 * Emit the converted contents of the element inside a verbatim block:
 * a newline, the opening verbatim tag on its own line, the contents,
 * and the closing verbatim tag on its own line.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_verbatim($node)
{
    // The tag name is kept apart from the angle brackets, as before, so the
    // literal tag never appears in this source file.
    $tag = 'verbatim';
    $body = $this->elem_contents($node);

    return "\n<" . $tag . ">\n" . $body . "\n</" . $tag . ">";
}
/**
 * Drop the element's own tags and return only its converted contents.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_noinclude($node)
{
    $inner = $this->elem_contents($node);

    return $inner;
}
/**
 * Render an IMG element as "[ file attr attr ... ]" image markup.
 *
 * Attributes added to the markup (since 1.3.10):
 *  - class=align-<dir>: taken from the IMG's align attribute or, when
 *    that is absent, from a float style / float class on the enclosing
 *    image container (see _elem_is_image_div);
 *  - width=, height= or size=<w>x<h>: added when the IMG's numeric
 *    width/height differ from the real dimensions of the image, i.e.
 *    when a thumbnail is needed. Height is only examined when a width
 *    attribute is present;
 *  - alt=<text>: the alternate text, if any.
 *
 * @param HtmlElement $node
 * @return string
 */
function wikify_img($node)
{
    $image_url = $this->absolute_url($node->getAttr('src'));
    $file = basename($image_url);
    $alignment = $node->getAttr('align');
    $this->log("Processing IMG tag for SRC: " . $image_url . "...");

    // Grab attributes to be added to the [ Image ] markup (since 1.3.10)
    $attrs = array();

    // Find the container that wraps the image, either directly or around
    // a link that wraps the image.
    $image_div = null;
    $parent = isset($node->parent) ? $node->parent : null;
    if ($this->_elem_is_image_div($parent)) {
        $image_div = $parent;
    } elseif ($parent and isset($parent->parent) and $this->_elem_is_image_div($parent->parent)) {
        $image_div = $parent->parent;
    }

    if (!$alignment and $image_div) {
        $css_style = (string) $image_div->getAttr('style');
        $css_class = (string) $image_div->getAttr('class');

        // float => align: Check for a float declaration; if it's there,
        // then we'll add it to the [Image] syntax. The alignment is only
        // taken from a pattern that actually matched.
        if (preg_match("/float\:\s*(right|left)/i", $css_style, $m)) {
            $alignment = strtolower($m[1]);
        } elseif (preg_match("/float(right|left)/i", $css_class, $m)) {
            $alignment = strtolower($m[1]);
        }

        if ($alignment) {
            $this->log(" Image is contained within a DIV that specifies $alignment alignment");
            $this->log(" Adding '$alignment' to [Image] markup attributes");
        } else {
            $this->log(" Image is not contained within a DIV for alignment");
        }
    } else {
        $this->log(" Image is not contained within a DIV");
    }

    // Added exactly once, whichever source the alignment came from.
    if ($alignment) {
        $attrs[] = "class=align-$alignment";
    }

    // Check if we need to request a thumbnail of this
    // image; it's needed if the specified width attribute
    // differs from the default size of the image
    $width = (string) $node->getAttr('width');
    if ($width) {
        $this->log(" Image has WIDTH attribute of $width");
        $this->log(" Checking whether resulting [Image] markup should specify a thumbnail...");

        // Download the image from the network to learn its real size.
        // NOTE(review): the URL comes straight from the imported HTML, so
        // this makes a request to an address chosen by that document --
        // confirm this is acceptable where the importer runs.
        $abs_url = $this->absolute_url($node->getAttr('src'));
        $this->log(" Fetching image '$abs_url' from the network");
        $size = getimagesize($abs_url);
        // getimagesize() yields false when the image cannot be read; the
        // real size is then unknown and counts as "different".
        $actual_w = is_array($size) ? $size[0] : null;
        $actual_h = is_array($size) ? $size[1] : null;

        // If the WIDTH attribute of the IMG tag is not equal
        // to the actual width of the image, then we need to
        // create a thumbnail
        $width_added = false;
        if (preg_match("/^\d+$/", $width) and (int) $width !== $actual_w) {
            $this->log(" IMG tag's WIDTH attribute ($width) differs from actual width of image ($actual_w)");
            $this->log(" -- that means we're going to need a thumbnail");
            $this->log(" Adding 'width' to list of attributes for [Image] markup");
            $attrs[] = "width=$width";
            $width_added = true;
        }

        $height = (string) $node->getAttr('height');
        if (preg_match("/^\d+$/", $height) and (int) $height !== $actual_h) {
            $this->log(" IMG tag's HEIGHT attribute ($height) differs from actual height of image ($actual_h)");
            $this->log(" -- that means we're going to need a thumbnail");
            $this->log(" Adding 'height' to list of attributes for [Image] markup");
            if ($width_added) {
                // Fold the width entry appended just above into one size entry.
                $attrs[count($attrs) - 1] = "size=" . $width . "x" . $height;
            } else {
                $attrs[] = "height=$height";
            }
        }
    }

    if ($alt = $node->getAttr('alt')) {
        $this->log(" Adding alternate text '$alt' to [Image] markup");
        $attrs[] = "alt=$alt";
    }

    $attr_str = join(' ', $attrs);
    $this->log("...done processing IMG tag\n");
    return "[ $file $attr_str ]";
}