5 * @author Benjamin Nowack <bnowack@semsol.com>
6 * @license http://arc.semsol.org/license
7 * @homepage <http://arc.semsol.org/>
12 ARC2::inc('RDFParser');
14 class ARC2_RDFXMLParser extends ARC2_RDFParser {
/**
 * Builds the RDF/XML parser by delegating to the parent constructor.
 *
 * @param mixed $a      Passed through unchanged to the parent (defaults to '').
 * @param mixed $caller Received by reference and forwarded to the parent.
 */
function __construct($a = '', &$caller) {
  parent::__construct($a, $caller);
}
/**
 * Class-named (PHP 4 style) constructor; it only forwards to __construct().
 *
 * @param mixed $a      Passed through unchanged (defaults to '').
 * @param mixed $caller Received by reference and forwarded.
 */
function ARC2_RDFXMLParser($a = '', &$caller) {
  $this->__construct($a, $caller);
}
/**
 * Sets the parser's starting values: the encoding option, the document base,
 * the xml/rdf namespace URIs with their prefixes, an empty subject stack and
 * an empty target encoding.
 *
 * NOTE(review): the embedded numbering skips lines 25, 27, 28 and 34, so any
 * further initialisation done there is not visible in this view.
 */
24 function __init() {/* reader */
// 'encoding' is looked up via v() against $this->a with false as the second argument
// (presumably the default when the key is absent - confirm against v()).
26 $this->encoding = $this->v('encoding', false, $this->a);
// The working xml:base starts out as the parser's own base.
29 $this->x_base = $this->base;
// Namespace URIs used as key prefixes for xml:* and rdf:* names throughout the class.
30 $this->xml = 'http://www.w3.org/XML/1998/namespace';
31 $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
// Namespace URI => prefix map; nsDecl() adds further entries.
32 $this->nsp = array($this->xml => 'xml', $this->rdf => 'rdf');
// Stack of subject entries handled by pushS()/popS()/updateS()/getParentS().
33 $this->s_stack = array();
// Filled in by parse() from the XML parser's XML_OPTION_TARGET_ENCODING.
35 $this->target_encoding = '';
/**
 * Activates the reader for $path/$data and feeds the stream chunk by chunk
 * into the XML parser created by initXMLParser().
 *
 * When xml_parse() fails with an "Invalid character" error and this is not
 * already the fallback run, the parser is freed, the encoding is switched to
 * ISO-8859-1 and parse() calls itself with $iso_fallback = true.
 *
 * @param string $path         Forwarded to the reader's activate().
 * @param string $data         Forwarded to the reader's activate().
 * @param bool   $iso_fallback True on the ISO-8859-1 retry; callers normally leave it false.
 * @return mixed The result of the recursive call or of addError() on the paths visible here.
 *
 * NOTE(review): several original lines (41, 43, 45, 49, 51-52, 57-58, ...) are
 * not visible in this view, including wherever $first is assigned.
 */
40 function parse($path, $data = '', $iso_fallback = false) {
// Create a reader only when v('reader') reports none.
42 if (!$this->v('reader')) {
// NOTE(review): assigning the result of `new` by reference is a parse error on PHP 7+;
// a plain `=` assignment is needed there.
44 $this->reader = & new ARC2_Reader($this->a, $this);
46 $this->reader->setAcceptHeader('Accept: application/rdf+xml; q=0.9, */*; q=0.1');
47 $this->reader->activate($path, $data);
// A non-empty configured 'base' wins over the base reported by the reader.
48 $this->x_base = isset($this->a['base']) && $this->a['base'] ? $this->a['base'] : $this->reader->base;
50 $this->initXMLParser();
// Loop ends as soon as readStream() returns a falsy chunk.
53 while ($d = $this->reader->readStream()) {
// Re-arm the execution time limit per chunk unless keep_time_limit is set; errors are suppressed with @.
54 if (!$this->keep_time_limit) @set_time_limit($this->v('time_limit', 60, $this->a));
// On the fallback run, replace any leading XML declaration with one declaring ISO-8859-1.
55 if ($iso_fallback && $first) {
56 $d = '<?xml version="1.0" encoding="ISO-8859-1"?>' . "\n" . preg_replace('/^\<\?xml [^\>]+\?\>\s*/s', '', $d);
// Third argument false: the chunk is not flagged as the final piece of input.
59 if (!xml_parse($this->xml_parser, $d, false)) {
60 $error_str = xml_error_string(xml_get_error_code($this->xml_parser));
61 $line = xml_get_current_line_number($this->xml_parser);
62 $this->tmp_error = 'XML error: "' . $error_str . '" at line ' . $line . ' (parsing as ' . $this->getEncoding() . ')';
// Retry once as ISO-8859-1 when the failure is an invalid-character error.
63 if (!$iso_fallback && preg_match("/Invalid character/i", $error_str)) {
64 xml_parser_free($this->xml_parser);
65 unset($this->xml_parser);
66 $this->reader->closeStream();
68 $this->encoding = 'ISO-8859-1';
// NOTE(review): xml_parser was already unset three lines up; this second unset looks redundant.
69 unset($this->xml_parser);
71 return $this->parse($path, $data, true);
// Presumably the non-retry branch (the enclosing else/braces are not visible here): report the stored error.
74 return $this->addError($this->tmp_error);
// After the loop: remember the parser's target encoding, then release parser and stream.
78 $this->target_encoding = xml_parser_get_option($this->xml_parser, XML_OPTION_TARGET_ENCODING);
79 xml_parser_free($this->xml_parser);
80 $this->reader->closeStream();
/**
 * Creates the namespace-aware XML parser once and stores it in $this->xml_parser.
 *
 * Does nothing when a parser is already set. The encoding reported by
 * getEncoding() is used only if it is one of utf-8, iso-8859-1 or us-ascii
 * (case-insensitive); anything else falls back to 'UTF-8'.
 */
function initXMLParser() {
  if (isset($this->xml_parser)) {
    return;
  }
  $enc = 'UTF-8';
  if (preg_match('/^(utf\-8|iso\-8859\-1|us\-ascii)$/i', $this->getEncoding(), $m)) {
    $enc = $m[1];
  }
  /* empty separator: namespace URI and local name are handed to the handlers concatenated */
  $xp = xml_parser_create_ns($enc, '');
  xml_parser_set_option($xp, XML_OPTION_SKIP_WHITE, 0);
  xml_parser_set_option($xp, XML_OPTION_CASE_FOLDING, 0);
  /* handlers are method names resolved on $this through xml_set_object() */
  xml_set_element_handler($xp, 'open', 'close');
  xml_set_character_data_handler($xp, 'cdata');
  xml_set_start_namespace_decl_handler($xp, 'nsDecl');
  xml_set_object($xp, $this);
  $this->xml_parser =& $xp;
}
/**
 * Returns an encoding name from the requested source.
 *
 * @param string $src 'parser' returns the stored target encoding; 'config'
 *                    (the default) returns the configured encoding when it is
 *                    truthy. Every other case asks the reader.
 * @return mixed The selected encoding value.
 */
function getEncoding($src = 'config') {
  if ($src == 'parser') {
    return $this->target_encoding;
  }
  $use_config = ($src == 'config') && $this->encoding;
  return $use_config ? $this->encoding : $this->reader->getEncoding();
}
/**
 * Returns whatever v() yields for 'triples', with an empty array passed as the second argument.
 *
 * @return mixed
 */
function getTriples() {
  $triples = $this->v('triples', array());
  return $triples;
}
/**
 * Returns the triple counter (t_count), which addT() also uses as the next index into $this->triples.
 *
 * @return mixed
 */
function countTriples() {
  $total = $this->t_count;
  return $total;
}
/**
 * Stamps the current stack depth onto $s as 'pos' and stores $s at that depth in s_stack.
 *
 * @param array $s Subject entry, taken by reference, so the caller's array also receives 'pos'.
 *
 * NOTE(review): the embedded numbering skips line 128; presumably s_count is
 * advanced there - confirm against the complete file.
 */
125 function pushS(&$s) {
126 $s['pos'] = $this->s_count;
127 $this->s_stack[$this->s_count] = $s;
/**
 * Rebuilds the subject stack by copying entries 0 .. s_count-1 into $r one index at a time
 * (the original comment marks this manual copy as a PHP 4.0.x-safe approach).
 *
 * NOTE(review): lines 132, 133 and 137 are not visible in this view. Presumably $r is
 * initialised, s_count is decremented before the loop and $r is stored back into
 * s_stack afterwards - confirm against the complete file.
 */
131 function popS(){/* php 4.0.x-safe */
// $i_max is fixed once from s_count when the loop starts.
134 for ($i = 0, $i_max = $this->s_count; $i < $i_max; $i++) {
135 $r[$i] = $this->s_stack[$i];
/**
 * Writes $s back into the subject stack at the slot recorded in $s['pos'].
 *
 * @param array $s Subject entry carrying the 'pos' key that pushS() sets.
 */
function updateS($s) {
  $slot = $s['pos'];
  $this->s_stack[$slot] = $s;
}
/**
 * Returns the stack entry at index s_count - 1.
 *
 * @return mixed The entry, or false when s_count is falsy or that slot is not set.
 */
function getParentS() {
  if (!$this->s_count) {
    return false;
  }
  $top = $this->s_count - 1;
  if (!isset($this->s_stack[$top])) {
    return false;
  }
  return $this->s_stack[$top];
}
/**
 * Returns the base URI a new node inherits.
 *
 * With a parent entry on the stack: its non-empty 'p_x_base' wins, then its
 * 'x_base', then ''. Without a parent entry: the parser-level x_base.
 *
 * @return mixed
 */
function getParentXBase() {
  $parent = $this->getParentS();
  if (!$parent) {
    return $this->x_base;
  }
  if (isset($parent['p_x_base']) && $parent['p_x_base']) {
    return $parent['p_x_base'];
  }
  return isset($parent['x_base']) ? $parent['x_base'] : '';
}
/**
 * Returns the language a new node inherits.
 *
 * With a parent entry on the stack: its non-empty 'p_x_lang' wins, then its
 * 'x_lang', then ''. Without a parent entry: the parser-level x_lang.
 *
 * @return mixed
 */
function getParentXLang() {
  $parent = $this->getParentS();
  if (!$parent) {
    return $this->x_lang;
  }
  if (isset($parent['p_x_lang']) && $parent['p_x_lang']) {
    return $parent['p_x_lang'];
  }
  return isset($parent['x_lang']) ? $parent['x_lang'] : '';
}
/**
 * Builds a triple array from the arguments and stores it in $this->triples at index t_count.
 *
 * When skip_dupes is set, an md5 of the serialized triple is used as a key in
 * added_triples so that a triple already seen is not stored again.
 *
 * @param string $s       Subject value.
 * @param string $p       Predicate value.
 * @param string $o       Object value.
 * @param string $s_type  Subject type (callers in this class pass 'uri' or 'bnode').
 * @param string $o_type  Object type (callers in this class pass 'uri', 'bnode' or 'literal').
 * @param string $o_dt    Stored as 'o_datatype'.
 * @param string $o_lang  Stored as 'o_lang'.
 *
 * NOTE(review): lines 171, 175 and 177 are not visible in this view; presumably they
 * advance t_count and open the non-dedup else branch - confirm against the complete file.
 */
164 function addT($s, $p, $o, $s_type, $o_type, $o_dt = '', $o_lang = '') {
165 //echo "-----\nadding $s / $p / $o\n-----\n";
166 $t = array('s' => $s, 'p' => $p, 'o' => $o, 's_type' => $s_type, 'o_type' => $o_type, 'o_datatype' => $o_dt, 'o_lang' => $o_lang);
167 if ($this->skip_dupes) {
// Fingerprint of the whole triple, used purely as a lookup key.
168 $h = md5(serialize($t));
169 if (!isset($this->added_triples[$h])) {
170 $this->triples[$this->t_count] = $t;
172 $this->added_triples[$h] = true;
// Presumably the branch taken when skip_dupes is off: store unconditionally.
176 $this->triples[$this->t_count] = $t;
/**
 * Emits the four reification triples for one statement, all with subject $t (typed 'uri'):
 * rdf:type rdf:Statement, rdf:subject, rdf:predicate and rdf:object.
 *
 * @param string $t       URI identifying the reified statement.
 * @param string $s       Subject of the original triple.
 * @param string $p       Predicate of the original triple.
 * @param string $o       Object of the original triple.
 * @param string $s_type  Type of $s, used as object type of the rdf:subject triple.
 * @param string $o_type  Type of $o, used as object type of the rdf:object triple.
 * @param string $o_dt    Datatype forwarded with the rdf:object triple.
 * @param string $o_lang  Language forwarded with the rdf:object triple.
 */
function reify($t, $s, $p, $o, $s_type, $o_type, $o_dt = '', $o_lang = '') {
  $rdf_ns = $this->rdf;
  $this->addT($t, $rdf_ns . 'type', $rdf_ns . 'Statement', 'uri', 'uri');
  $this->addT($t, $rdf_ns . 'subject', $s, 'uri', $s_type);
  $this->addT($t, $rdf_ns . 'predicate', $p, 'uri', 'uri');
  $this->addT($t, $rdf_ns . 'object', $o, 'uri', $o_type, $o_dt, $o_lang);
}
/**
 * Start-element handler registered with the XML parser in initXMLParser().
 * Routes the element to the hNOpen() method matching the current state
 * (0, 1, 2, 4, 5 or 6); any other state is reported through addError().
 *
 * @param mixed  $p Parser handle supplied by the XML extension (unused).
 * @param string $t Element name.
 * @param array  $a Element attributes.
 * @return mixed The handler's return value; nothing is returned for an unhandled state.
 */
function open($p, $t, $a) {
  $state = $this->state;
  $handlers = array(0 => 'h0Open', 1 => 'h1Open', 2 => 'h2Open', 4 => 'h4Open', 5 => 'h5Open', 6 => 'h6Open');
  foreach ($handlers as $handled_state => $method) {
    /* loose comparison, as a switch statement would perform */
    if ($state == $handled_state) {
      return $this->$method($t, $a);
    }
  }
  $this->addError('open() called at state ' . $this->state . ' in '.$t);
}
/**
 * End-element handler registered with the XML parser in initXMLParser().
 * Routes the closing element to the hNClose() method matching the current
 * state (1 to 6); any other state is reported through addError().
 *
 * @param mixed  $p Parser handle supplied by the XML extension (unused).
 * @param string $t Element name.
 * @return mixed The handler's return value; nothing is returned for an unhandled state.
 */
function close($p, $t) {
  $state = $this->state;
  $handlers = array(1 => 'h1Close', 2 => 'h2Close', 3 => 'h3Close', 4 => 'h4Close', 5 => 'h5Close', 6 => 'h6Close');
  foreach ($handlers as $handled_state => $method) {
    /* loose comparison, as a switch statement would perform */
    if ($state == $handled_state) {
      return $this->$method($t);
    }
  }
  $this->addError('close() called at state ' . $this->state . ' in '.$t);
}
/**
 * Character-data handler registered with the XML parser in initXMLParser().
 * Text is only consumed in states 4 and 6; in every other state it is dropped
 * and false is returned.
 *
 * @param mixed  $p Parser handle supplied by the XML extension (unused).
 * @param string $d The character data chunk.
 * @return mixed The state handler's return value, or false.
 */
218 function cdata($p, $d) {
219 //echo "state is $this->state\n";
221 switch($this->state){
222 case 4: return $this->h4Cdata($d);
223 case 6: return $this->h6Cdata($d);
224 default: return false;
/**
 * Namespace-declaration handler registered with the XML parser in initXMLParser().
 * Records the prefix for a namespace URI; a prefix already recorded for that
 * URI is kept.
 *
 * @param mixed  $p   Parser handle supplied by the XML extension (unused).
 * @param string $prf Declared prefix.
 * @param string $uri Namespace URI.
 */
function nsDecl($p, $prf, $uri) {
  if (!isset($this->nsp[$uri])) {
    $this->nsp[$uri] = $prf;
  }
}
/**
 * State-0 open handler (dispatched from open()): picks up xml:lang and xml:base
 * from the element, and hands any element other than rdf:RDF on to h1Open().
 *
 * @param string $t Element name.
 * @param array  $a Element attributes.
 *
 * NOTE(review): line 237 is not visible in this view; presumably the state
 * changes there - confirm against the complete file.
 */
234 function h0Open($t, $a) {
// Current values are passed as the second argument to v() (presumably the fallback).
235 $this->x_lang = $this->v($this->xml.'lang', $this->x_lang, $a);
236 $this->x_base = $this->calcURI($this->v($this->xml.'base', $this->x_base, $a));
// A root element that is not rdf:RDF is handled as a node element.
238 if ($t !== $this->rdf.'RDF') {
239 $this->h1Open($t, $a);
/**
 * Node-element open handler (state 1; also reached through h0Open() and h4Open()).
 *
 * Builds the subject entry $s for the element, links it to the enclosing
 * property or collection when called in state 4, and emits rdf:type and
 * attribute triples.
 *
 * @param string $t Element name.
 * @param array  $a Element attributes.
 *
 * NOTE(review): many original lines (246, 249-251, 253, 255-256, ...) are not
 * visible in this view, including the array opening for $s, the 'type'
 * assignments of the rdf:ID / rdf:about branches and all stack/state updates.
 */
245 function h1Open($t, $a) {
// Entries of the subject array: an xml:base / xml:lang on the element overrides the inherited value.
247 'x_base' => isset($a[$this->xml.'base']) ? $this->calcURI($a[$this->xml.'base']) : $this->getParentXBase(),
248 'x_lang' => isset($a[$this->xml.'lang']) ? $a[$this->xml.'lang'] : $this->getParentXLang(),
// Subject identity: rdf:ID (resolved as '#ID' against x_base), else rdf:about, else a blank node.
252 if (isset($a[$this->rdf.'ID'])) {
254 $s['value'] = $this->calcURI('#'.$a[$this->rdf.'ID'], $s['x_base']);
257 elseif (isset($a[$this->rdf.'about'])) {
259 $s['value'] = $this->calcURI($a[$this->rdf.'about'], $s['x_base']);
263 $s['type'] = 'bnode';
// rdf:nodeID names the blank node; the createBnodeID() line is presumably the fallback
// when no nodeID is given (the else line is not visible here).
264 if (isset($a[$this->rdf.'nodeID'])) {
265 $s['value'] = '_:'.$a[$this->rdf.'nodeID'];
268 $s['value'] = $this->createBnodeID();
// State 4: this node is the object of an open property on the parent entry.
272 if ($this->state === 4) {
273 $sup_s = $this->getParentS();
// First member of a collection: link parent --p--> new list node, list node --rdf:first--> this node.
275 if (isset($sup_s['o_is_coll']) && $sup_s['o_is_coll']) {
276 $coll = array('value' => $this->createBnodeID(), 'type' => 'bnode', 'is_coll' => true, 'x_base' => $s['x_base'], 'x_lang' => $s['x_lang']);
277 $this->addT($sup_s['value'], $sup_s['p'], $coll['value'], $sup_s['type'], $coll['type']);
278 $this->addT($coll['value'], $this->rdf . 'first', $s['value'], $coll['type'], $s['type']);
281 /* new entry in existing coll */
// Parent is itself a list node: chain with rdf:rest, then rdf:first to this node.
282 elseif (isset($sup_s['is_coll']) && $sup_s['is_coll']) {
283 $coll = array('value' => $this->createBnodeID(), 'type' => 'bnode', 'is_coll' => true, 'x_base' => $s['x_base'], 'x_lang' => $s['x_lang']);
284 $this->addT($sup_s['value'], $this->rdf . 'rest', $coll['value'], $sup_s['type'], $coll['type']);
285 $this->addT($coll['value'], $this->rdf . 'first', $s['value'], $coll['type'], $s['type']);
288 /* normal sub-node */
289 elseif(isset($sup_s['p']) && $sup_s['p']) {
290 $this->addT($sup_s['value'], $sup_s['p'], $s['value'], $sup_s['type'], $s['type']);
// A typed node element (anything but rdf:Description) yields an rdf:type triple.
294 if ($t !== $this->rdf.'Description') {
295 $this->addT($s['value'], $this->rdf.'type', $t, $s['type'], 'uri');
297 /* (additional) typing attr */
298 if (isset($a[$this->rdf.'type'])) {
299 $this->addT($s['value'], $this->rdf.'type', $a[$this->rdf.'type'], $s['type'], 'uri');
// Container elements; the body of this branch is not visible here (presumably it
// initialises the rdf:li counter read by h2Open()).
302 if (in_array($t, array($this->rdf.'Seq', $this->rdf.'Bag', $this->rdf.'Alt'))) {
305 /* any other attrs (skip rdf and xml, except rdf:_, rdf:value, rdf:Seq) */
306 foreach($a as $k => $v) {
// Keep attributes outside the xml/rdf namespaces, plus names ending in one of the listed rdf terms.
307 if (((strpos($k, $this->xml) === false) && (strpos($k, $this->rdf) === false)) || preg_match('/(\_[0-9]+|value|Seq|Bag|Alt|Statement|Property|List)$/', $k)) {
// Only keys containing ':' past position 0 are used as predicates.
308 if (strpos($k, ':')) {
309 $this->addT($s['value'], $k, $v, $s['type'], 'literal', '', $s['x_lang']);
/**
 * Property-element open handler (state 2, dispatched from open()).
 *
 * Works on the current subject entry $s: records property-level xml:base,
 * xml:lang and rdf:ID, then decides from the attributes how the object is
 * obtained (rdf:resource, rdf:nodeID, rdf:parseType, or nested content).
 *
 * @param string $t Property element name.
 * @param array  $a Property element attributes.
 *
 * NOTE(review): many original lines are not visible in this view, including the
 * body of the first foreach, where $s['p'] is assigned, and every state change
 * and stack update.
 */
319 function h2Open($t, $a) {
320 $s = $this->getParentS();
// Per-property keys of the subject entry; the loop body is not visible (presumably it clears them).
321 foreach (array('p_x_base', 'p_x_lang', 'p_id', 'o_is_coll') as $k) {
// Property-level xml:base is resolved against the subject's base.
325 if (isset($a[$this->xml.'base'])) {
326 $s['p_x_base'] = $this->calcURI($a[$this->xml.'base'], $s['x_base']);
// $b: effective base for this property.
328 $b = isset($s['p_x_base']) && $s['p_x_base'] ? $s['p_x_base'] : $s['x_base'];
330 if (isset($a[$this->xml.'lang'])) {
331 $s['p_x_lang'] = $a[$this->xml.'lang'];
// $l: effective language for this property.
333 $l = isset($s['p_x_lang']) && $s['p_x_lang'] ? $s['p_x_lang'] : $s['x_lang'];
// rdf:li is rewritten to rdf:_N using the subject's li_count (its increment is not visible here).
335 if ($t === $this->rdf.'li') {
337 $t = $this->rdf.'_'.$s['li_count'];
// rdf:ID on a property is kept as p_id and later used to reify the statement.
342 if (isset($a[$this->rdf.'ID'])) {
343 $s['p_id'] = $a[$this->rdf.'ID'];
345 $o = array('value' => '', 'type' => '', 'x_base' => $b, 'x_lang' => $l);
346 /* resource/rdf:resource */
// An unqualified 'resource' attribute is treated as rdf:resource.
347 if (isset($a['resource'])) {
348 $a[$this->rdf . 'resource'] = $a['resource'];
349 unset($a['resource']);
// Object given by rdf:resource: resolve against $b and emit the triple.
351 if (isset($a[$this->rdf.'resource'])) {
352 $o['value'] = $this->calcURI($a[$this->rdf.'resource'], $b);
354 $this->addT($s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
// rdf:type on the property element types the object resource.
356 if (isset($a[$this->rdf.'type'])) {
357 $this->addT($o['value'], $this->rdf.'type', $a[$this->rdf.'type'], 'uri', 'uri');
360 if (isset($s['p_id'])) {
361 $this->reify($this->calcURI('#'.$s['p_id'], $b), $s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
// Object given by rdf:nodeID: a named blank node.
367 elseif (isset($a[$this->rdf.'nodeID'])) {
368 $o['value'] = '_:' . $a[$this->rdf.'nodeID'];
369 $o['type'] = 'bnode';
370 $this->addT($s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
373 if (isset($s['p_id'])) {
374 $this->reify($this->calcURI('#'.$s['p_id'], $b), $s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
378 elseif (isset($a[$this->rdf.'parseType'])) {
// parseType="Literal": reset the XML-literal buffer and nesting counters used by the h6* handlers.
379 if ($a[$this->rdf.'parseType'] === 'Literal') {
380 $s['o_xml_level'] = 0;
381 $s['o_xml_data'] = '';
382 $s['p_xml_literal_level'] = 0;
// parseType="Resource": the object is a fresh blank node; has_closing_tag = 0 is read back in h2Close().
386 elseif ($a[$this->rdf.'parseType'] === 'Resource') {
387 $o['value'] = $this->createBnodeID();
388 $o['type'] = 'bnode';
389 $o['has_closing_tag'] = 0;
390 $this->addT($s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
393 if (isset($s['p_id'])) {
394 $this->reify($this->calcURI('#'.$s['p_id'], $b), $s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
// parseType="Collection": flag read by h1Open() when the first member node opens.
399 elseif ($a[$this->rdf.'parseType'] === 'Collection') {
400 $s['o_is_coll'] = true;
404 /* sub-node or literal */
// rdf:datatype is remembered for the literal emitted in h4Close().
407 if (isset($a[$this->rdf.'datatype'])) {
408 $s['o_datatype'] = $a[$this->rdf.'datatype'];
412 /* any other attrs (skip rdf and xml) */
413 foreach($a as $k => $v) {
414 if (((strpos($k, $this->xml) === false) && (strpos($k, $this->rdf) === false)) || preg_match('/(\_[0-9]+|value)$/', $k)) {
415 if (strpos($k, ':')) {
// Property attributes hang off a blank-node object. Line 416 is not visible; presumably
// it guards the node creation so it happens once - confirm against the complete file.
417 $o['value'] = $this->createBnodeID();
418 $o['type'] = 'bnode';
419 $this->addT($s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
422 if (isset($s['p_id'])) {
423 $this->reify($this->calcURI('#'.$s['p_id'], $b), $s['value'], $s['p'], $o['value'], $s['type'], $o['type']);
426 $this->addT($o['value'], $k, $v, $o['type'], 'literal');
/**
 * State-4 open handler: an element opening in this state is processed by h1Open().
 *
 * @param string $t Element name.
 * @param array  $a Element attributes.
 * @return mixed Whatever h1Open() returns.
 */
function h4Open($t, $a) {
  $result = $this->h1Open($t, $a);
  return $result;
}
/**
 * State-5 open handler: delegates to h4Open().
 *
 * @param string $t Element name.
 * @param array  $a Element attributes.
 * @return mixed Whatever h4Open() returns.
 *
 * NOTE(review): line 443 is not visible in this view; presumably the state is
 * changed there before delegating - confirm against the complete file.
 */
442 function h5Open($t, $a) {
444 return $this->h4Open($t, $a);
/**
 * State-6 open handler: serialises an element that opens inside an XML literal
 * by appending its start tag, namespace declaration and attributes to the
 * subject entry's 'o_xml_data' buffer.
 *
 * @param string $t Element name.
 * @param array  $a Element attributes.
 *
 * NOTE(review): several original lines are not visible in this view, including
 * where $ns_uri and $name are taken from $parts and where the tag is closed
 * and $s is written back.
 * NOTE(review): attribute values ($v) are concatenated into the markup without
 * escaping on the lines visible here; values containing '"', '&' or '<' would
 * produce malformed XML - confirm and escape if so.
 */
449 function h6Open($t, $a) {
450 $s = $this->getParentS();
451 $data = isset($s['o_xml_data']) ? $s['o_xml_data'] : '';
// Namespace declarations already written for this literal, keyed as "prefix=uri".
452 $ns = isset($s['ns']) ? $s['ns'] : array();
453 $parts = $this->splitURI($t);
// A single part: the name could not be split (that branch's body is not visible here).
454 if (count($parts) === 1) {
// Unknown namespace URI: look for a registered namespace that $t starts with and re-split there.
460 if (!isset($this->nsp[$ns_uri])) {
461 foreach ($this->nsp as $tmp1 => $tmp2) {
462 if (strpos($t, $tmp1) === 0) {
464 $name = substr($t, strlen($tmp1));
469 $nsp = $this->nsp[$ns_uri];
// Start tag, with a prefix when one is registered for the namespace.
470 $data .= $nsp ? '<' . $nsp . ':' . $name : '<' . $name;
// Declare each prefix/URI pair once.
472 if (!isset($ns[$nsp.'='.$ns_uri]) || !$ns[$nsp.'='.$ns_uri]) {
473 $data .= $nsp ? ' xmlns:'.$nsp.'="'.$ns_uri.'"' : ' xmlns="'.$ns_uri.'"';
474 $ns[$nsp.'='.$ns_uri] = true;
478 foreach ($a as $k => $v) {
479 $parts = $this->splitURI($k);
// Attribute name that does not split: written as is.
480 if (count($parts) === 1) {
481 $data .= ' '.$k.'="'.$v.'"';
486 $nsp = $this->nsp[$ns_uri];
487 $data .= $nsp ? ' '.$nsp.':'.$name.'="'.$v.'"' : ' '.$name.'="'.$v.'"' ;
491 $s['o_xml_data'] = $data;
// One level deeper inside the literal; h6Close() counts back down.
492 $s['o_xml_level'] = isset($s['o_xml_level']) ? $s['o_xml_level'] + 1 : 1;
// Track nested elements that carry the same name as the containing property.
493 if ($t == $s['p']) {/* xml container prop */
494 $s['p_xml_literal_level'] = isset($s['p_xml_literal_level']) ? $s['p_xml_literal_level'] + 1 : 1;
/**
 * State-1 close handler (dispatched from close()); per the original comment this
 * is the end of the document. The body is not visible in this view.
 *
 * @param string $t Element name being closed.
 */
501 function h1Close($t) {/* end of doc */
/**
 * State-2 close handler (dispatched from close()): a closing tag arrives where a
 * property element was expected, i.e. the current node element ends.
 *
 * @param string $t Element name being closed.
 *
 * NOTE(review): lines 510-511 and 516-518, 520 onwards are not visible in this
 * view; presumably the stack is popped between the two getParentS() calls and the
 * state is set in the remaining branches - confirm against the complete file.
 */
507 function h2Close($t) {/* expecting a prop, getting a close */
508 if ($s = $this->getParentS()) {
// 0 only when the entry explicitly carries a falsy has_closing_tag (set for
// parseType="Resource" objects in h2Open()); otherwise 1.
509 $has_closing_tag = (isset($s['has_closing_tag']) && !$s['has_closing_tag']) ? 0 : 1;
512 if ($s = $this->getParentS()) {/* new s */
513 if (!isset($s['p']) || !$s['p']) {/* p close after collection|parseType=Resource|node close after p close */
// State 4 while entries remain on the stack, otherwise state 1.
514 $this->state = $this->s_count ? 4 : 1;
515 if (!$has_closing_tag) {
519 elseif (!$has_closing_tag) {
/**
 * State-3 close handler (dispatched from close()); per the original comment it
 * handles a property close. The body is not visible in this view.
 *
 * @param string $t Element name being closed.
 */
528 function h3Close($t) {/* p close */
/**
 * State-4 close handler (dispatched from close()): a property element closes.
 * Either a collection is terminated with rdf:nil, or the text gathered in
 * 'o_cdata' is emitted as a literal object; a statement is reified when the
 * property carried an rdf:ID (p_id).
 *
 * @param string $t Element name being closed.
 *
 * NOTE(review): several original lines are not visible in this view, including
 * the loop body that unwinds the stack (where $sub_s is presumably assigned),
 * the branch structure around the literal case, and the state/stack updates.
 */
534 function h4Close($t) {/* empty p | pClose after cdata | pClose after collection */
535 if ($s = $this->getParentS()) {
// Effective base: property-level base, then the subject's base, then ''.
536 $b = isset($s['p_x_base']) && $s['p_x_base'] ? $s['p_x_base'] : (isset($s['x_base']) ? $s['x_base'] : '');
// Current entry is a list node: close the list with rdf:rest rdf:nil.
537 if (isset($s['is_coll']) && $s['is_coll']) {
538 $this->addT($s['value'], $this->rdf . 'rest', $this->rdf . 'nil', $s['type'], 'uri');
539 /* back to collection start */
// Walk back until the entry whose property name equals the closing tag.
540 while ((!isset($s['p']) || ($s['p'] != $t))) {
543 $s = $this->getParentS();
546 if (isset($s['p_id']) && $s['p_id']) {
547 $this->reify($this->calcURI('#'.$s['p_id'], $b), $s['value'], $s['p'], $sub_s['value'], $s['type'], $sub_s['type']);
// Literal case: datatype and language default to ''.
553 $dt = isset($s['o_datatype']) ? $s['o_datatype'] : '';
554 $l = isset($s['p_x_lang']) && $s['p_x_lang'] ? $s['p_x_lang'] : (isset($s['x_lang']) ? $s['x_lang'] : '');
555 $o = array('type' => 'literal', 'value' => $s['o_cdata']);
556 $this->addT($s['value'], $s['p'], $o['value'], $s['type'], $o['type'], $dt, $l);
558 if (isset($s['p_id']) && $s['p_id']) {
559 $this->reify($this->calcURI('#'.$s['p_id'], $b), $s['value'], $s['p'], $o['value'], $s['type'], $o['type'], $dt, $l);
// Clear the per-property literal buffers on the local copy of the entry.
561 unset($s['o_cdata']);
562 unset($s['o_datatype']);
/**
 * State-5 close handler (dispatched from close()); per the original comment it
 * handles a property close. It acts only when a subject entry is on the stack;
 * the rest of the body is not visible in this view.
 *
 * @param string $t Element name being closed.
 */
572 function h5Close($t) {/* p close */
573 if ($s = $this->getParentS()) {
/**
 * State-6 close handler (dispatched from close()): a tag closes inside an XML literal.
 * At nesting level 0 the closing tag is the property itself and the buffered
 * markup is emitted as an rdf:XMLLiteral; otherwise the end tag is appended to
 * the buffer and the nesting level is decremented.
 *
 * @param string $t Element name being closed.
 *
 * NOTE(review): several original lines are not visible in this view, including
 * where $ns_uri and $name are taken from $parts, the else line separating the
 * two cases, and the state/stack updates.
 */
582 function h6Close($t) {
583 if ($s = $this->getParentS()) {
584 $l = isset($s['p_x_lang']) && $s['p_x_lang'] ? $s['p_x_lang'] : (isset($s['x_lang']) ? $s['x_lang'] : '');
585 $data = $s['o_xml_data'];
586 $level = $s['o_xml_level'];
587 if ($level === 0) {/* pClose */
// Only space characters are trimmed from the buffered markup.
588 $this->addT($s['value'], $s['p'], trim($data, ' '), $s['type'], 'literal', $this->rdf.'XMLLiteral', $l);
589 unset($s['o_xml_data']);
// Nested element: write its end tag, mirroring the name handling in h6Open().
593 $parts = $this->splitURI($t);
594 if (count($parts) == 1) {
595 $data .= '</'.$t.'>';
600 if (!isset($this->nsp[$ns_uri])) {
601 foreach ($this->nsp as $tmp1 => $tmp2) {
602 if (strpos($t, $tmp1) === 0) {
604 $name = substr($t, strlen($tmp1));
609 $nsp = $this->nsp[$ns_uri];
610 $data .= $nsp ? '</'.$nsp.':'.$name.'>' : '</'.$name.'>';
612 $s['o_xml_data'] = $data;
613 $s['o_xml_level'] = $level - 1;
// Undo the counter that h6Open() raises for nested elements named like the property.
614 if ($t == $s['p']) {/* xml container prop */
615 $s['p_xml_literal_level']--;
/**
 * State-4 character-data handler (dispatched from cdata()): appends the text chunk
 * to the current subject entry's 'o_cdata' buffer, which h4Close() later emits
 * as a literal.
 *
 * @param string $d Character data chunk.
 *
 * NOTE(review): the lines after 626 are not visible in this view; presumably the
 * modified entry is written back with updateS() - confirm against the complete file.
 */
624 function h4Cdata($d) {
625 if ($s = $this->getParentS()) {
// Chunks are concatenated because the XML parser may deliver text in pieces.
626 $s['o_cdata'] = isset($s['o_cdata']) ? $s['o_cdata'] . $d : $d;
/**
 * State-6 character-data handler (dispatched from cdata()): appends text found
 * inside an XML literal to the 'o_xml_data' buffer.
 *
 * @param string $d Character data chunk.
 *
 * NOTE(review): the function continues past the last line visible in this view.
 */
633 function h6Cdata($d) {
634 if ($s = $this->getParentS()) {
// Accept the chunk when a buffer already exists, or it contains a line break, or it is non-blank.
635 if (isset($s['o_xml_data']) || preg_match("/[\n\r]/", $d) || trim($d)) {
// Re-escape markup characters (quotes are left alone) since the parser delivers decoded text.
636 $d = htmlspecialchars($d, ENT_NOQUOTES);
637 $s['o_xml_data'] = isset($s['o_xml_data']) ? $s['o_xml_data'] . $d : $d;