3 homepage: http://arc.semsol.org/
4 license: http://arc.semsol.org/license
6 class: ARC2 RDFa Extractor
7 author: Benjamin Nowack
8 version: 2009-05-29 (Fix: CURIEs support DOTs now)
// Request the "RDFExtractor" component through ARC2::inc() before the subclass is declared.
// NOTE(review): presumably this loads the ARC2_RDFExtractor parent class; confirm in ARC2::inc().
11 ARC2::inc('RDFExtractor');
/**
 * RDFa extractor built on top of ARC2_RDFExtractor.
 *
 * The methods in this class walk the parsed node list held in $this->nodes,
 * resolve about/src/resource/href/rel/rev/typeof/property attribute values to
 * URIs or bnode IDs, and assemble triple arrays (keys such as "s", "s_type",
 * "p", "o", "o_type", "o_lang", "o_datatype").
 */
13 class ARC2_RdfaExtractor extends ARC2_RDFExtractor {
/**
 * PHP 5 constructor; passes both arguments straight through to the parent constructor.
 *
 * @param mixed $a      Forwarded unchanged; defaults to an empty string.
 *                      NOTE(review): presumably a configuration array; confirm in the parent class.
 * @param mixed $caller Taken by reference and forwarded to the parent.
 *
 * NOTE(review): $a is optional but precedes the required $caller; PHP 8.0+ reports
 * this parameter ordering as deprecated.
 */
15 function __construct($a = '', &$caller) {
16 parent::__construct($a, $caller);
/**
 * PHP 4-style constructor (a method named after the class); delegates to __construct()
 * with the same two arguments.
 *
 * NOTE(review): same-name constructors are deprecated since PHP 7.0 and are treated as
 * ordinary methods in PHP 8, so only __construct() acts as the constructor there.
 */
19 function ARC2_RdfaExtractor($a = '', &$caller) {
20 $this->__construct($a, $caller);
/**
 * Entry point: runs RDFa processing over the document, starting at the root node.
 *
 * Returns 0 immediately when the caller has no "rdfa" entry in detected_formats.
 */
29 function extractRDF() {
30 //echo '<pre>' . htmlspecialchars(print_r($this->nodes, 1)) . '</pre>';
// Nothing to extract unless the caller flagged RDFa for this document.
31 if (!isset($this->caller->detected_formats['rdfa'])) return 0;
32 $root_node = $this->getRootNode();
33 //$base = $this->v('xml:base', $this->getDocBase(), $root_node['a']);
// The base comes from getDocBase() alone; the xml:base lookup on the root node above is disabled.
34 $base = $this->getDocBase();
// NOTE(review): $context is not assigned in the statements visible here; confirm where the
// initial evaluation context passed below is built.
// Start the recursive walk at the root node with nesting level 0.
43 $this->processNode($root_node, $context, 0);
/**
 * Picks the node that processing starts from.
 *
 * Scans $this->nodes for an entry whose "tag" equals "html" (loose comparison).
 * The last statement returns $this->nodes[0].
 * NOTE(review): presumably the matching html node is returned from inside the loop and
 * nodes[0] is only the fallback; confirm.
 */
48 function getRootNode() {
49 foreach ($this->nodes as $id => $node) {
50 if ($node['tag'] == 'html') {
// Returns the first entry of the node list.
54 return $this->nodes[0];
/**
 * Processes one node against the inherited evaluation context and recurses into its sub-nodes.
 *
 * @param array $n     Node; this method reads $n["tag"] and the attribute map $n["a"].
 * @param array $ct    Inherited context; keys read here: "base", "ns", "lang", "p_s", "p_o",
 *                     "inco_ts" (and "prev_s" when present).
 * @param int   $level Nesting depth; recursive calls receive $level+1.
 * @return int|null    null for cdata/comment nodes; 1 when any of the result-flag checks at
 *                     the end holds. NOTE(review): the fall-through return value is not
 *                     visible here; confirm.
 */
59 function processNode($n, $ct, $level) {
// cdata and comment nodes are skipped entirely.
60 if ($n['tag']=='cdata' || $n['tag']=='comment') return null; /* patch by tobyink */
// Local context ($lct) setup.
// NOTE(review): assumes v(key, default, array) returns array[key] or the default; under that
// reading prev_s is the inherited "prev_s", else the inherited "p_s", else an empty string.
64 $lct['prev_s'] = $this->v('prev_s', $this->v('p_s', '', $ct), $ct);
// NOTE(review): $lct["new_s"], $lct["skip"], $lct["recurse"] and $ts_added are read further
// down but not initialised in the statements visible here; confirm their starting values.
68 $lct['cur_o_res'] = '';
69 $lct['inco_ts'] = array();
// The base is inherited unchanged; the per-node xml:base override below is disabled.
70 $lct['base'] = $ct['base'];
71 //$lct['base'] = $this->v('xml:base', $ct['base'], $n['a']);
// Namespace map: inherited prefixes merged with the "xmlns" entry of this node (for equal
// string keys array_merge keeps the later value).
73 $lct['ns'] = array_merge($ct['ns'], $this->v('xmlns', array(), $n['a']));
// Language: xml:lang of this node, falling back to the inherited language.
75 $lct['lang'] = $this->v('xml:lang', $ct['lang'], $n['a']);
// Resolve the rel and rev attribute values to URI lists once; both lists steer the branches below.
77 $rel_uris = $this->getAttributeURIs($n, $ct, $lct, 'rel');
78 $rev_uris = $this->getAttributeURIs($n, $ct, $lct, 'rev');
// Case 1: neither rel nor rev resolved to anything.
79 if (!$rel_uris && !$rev_uris) {
// Try about, src, resource, href in this order and test whether each resolves to a non-empty URI.
// NOTE(review): what is done with the resolved URI is not visible here; presumably it
// becomes the new subject. Confirm.
80 foreach (array('about', 'src', 'resource', 'href') as $attr) {
81 if (isset($n['a'][$attr]) && (list($uri, $sub_v) = $this->xURI($n['a'][$attr], $lct['base'], $lct['ns'], '', $lct)) && $uri) {
// head/body elements take the document base as subject.
// NOTE(review): the pattern is unanchored, so any tag name containing "head" or "body"
// (for example thead or tbody) matches as well; confirm this is intended.
87 if (preg_match('/(head|body)/i', $n['tag'])) {
88 $lct['new_s'] = $lct['base'];
// A typeof attribute that resolves to at least one URI gets a fresh bnode as subject.
90 elseif ($this->getAttributeURIs($n, $ct, $lct, 'typeof')) {
91 $lct['new_s'] = $this->createBnodeID();
// Here the subject is taken from the object of the parent context.
94 $lct['new_s'] = $ct['p_o'];
// Without a property attribute the node is flagged as skip.
96 if(!isset($n['a']['property'])) $lct['skip'] = 1;/* patch by masaka */
// Case 2 (rel and/or rev present): only about and src can set the new subject.
102 foreach (array('about', 'src') as $attr) {
103 if (isset($n['a'][$attr]) && (list($uri, $sub_v) = $this->xURI($n['a'][$attr], $lct['base'], $lct['ns'], '', $lct)) && $uri) {
104 $lct['new_s'] = $uri;
// Still no subject: same fallback chain (head/body, then typeof, then parent object).
108 if (!$lct['new_s']) {
109 if (preg_match('/(head|body)/i', $n['tag'])) {
110 $lct['new_s'] = $lct['base'];
112 elseif ($this->getAttributeURIs($n, $ct, $lct, 'typeof')) {
113 $lct['new_s'] = $this->createBnodeID();
115 elseif ($ct['p_o']) {
116 $lct['new_s'] = $ct['p_o'];
// resource and href supply the current object resource.
119 foreach (array('resource', 'href') as $attr) {
120 if (isset($n['a'][$attr]) && (list($uri, $sub_v) = $this->xURI($n['a'][$attr], $lct['base'], $lct['ns'], '', $lct)) && $uri) {
121 $lct['cur_o_res'] = $uri;
// typeof: one rdf:type triple per resolved type URI, with the new subject as "s".
128 if ($uris = $this->getAttributeURIs($n, $ct, $lct, 'typeof')) {
129 foreach ($uris as $uri) {
131 's' => $lct['new_s'],
// A value starting with "_:" is classified as bnode, anything else as uri.
132 's_type' => preg_match('/^\_\:/', $lct['new_s']) ? 'bnode' : 'uri',
133 'p' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
// An object resource is known: build the rel/rev triples right away.
143 if ($lct['cur_o_res']) {
// rel: new subject as "s", current object resource as "o".
145 foreach ($rel_uris as $uri) {
147 's' => $lct['new_s'],
148 's_type' => preg_match('/^\_\:/', $lct['new_s']) ? 'bnode' : 'uri',
150 'o' => $lct['cur_o_res'],
151 'o_type' => preg_match('/^\_\:/', $lct['cur_o_res']) ? 'bnode' : 'uri',
// rev: the same pair with subject and object swapped.
159 foreach ($rev_uris as $uri) {
161 's' => $lct['cur_o_res'],
162 's_type' => preg_match('/^\_\:/', $lct['cur_o_res']) ? 'bnode' : 'uri',
164 'o' => $lct['new_s'],
165 'o_type' => preg_match('/^\_\:/', $lct['new_s']) ? 'bnode' : 'uri',
// No object resource but rel/rev present: allocate a bnode as object and queue one
// incomplete triple per predicate, tagged with its direction, for the sub-nodes to complete.
175 if (!$lct['cur_o_res']) {
176 if ($rel_uris || $rev_uris) {
177 $lct['cur_o_res'] = $this->createBnodeID();
178 foreach ($rel_uris as $uri) {
179 $lct['inco_ts'][] = array('p' => $uri, 'dir' => 'fwd');
181 foreach ($rev_uris as $uri) {
182 $lct['inco_ts'][] = array('p' => $uri, 'dir' => 'rev');
// property: literal triples, only when the node is not flagged skip and a subject exists.
187 if (!$lct['skip'] && ($new_s = $lct['new_s'])) {
188 //if ($new_s = $lct['new_s']) {
189 if ($uris = $this->getAttributeURIs($n, $ct, $lct, 'property')) {
190 foreach ($uris as $uri) {
// The literal (value, lang, datatype) is computed again on every iteration.
191 $lct['cur_o_lit'] = $this->getCurrentObjectLiteral($n, $lct, $ct);
193 's' => $lct['new_s'],
194 's_type' => preg_match('/^\_\:/', $lct['new_s']) ? 'bnode' : 'uri',
196 'o' => $lct['cur_o_lit']['value'],
197 'o_type' => 'literal',
198 'o_lang' => $lct['cur_o_lit']['lang'],
199 'o_datatype' => $lct['cur_o_lit']['datatype'],
// XMLLiteral objects get special handling.
// NOTE(review): the branch body is not visible here; presumably it turns off recursion
// into the literal markup. Confirm.
202 if ($lct['cur_o_lit']['datatype'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral') {
// Recursion into sub-nodes; $complete_triples records whether any child call returned a truthy value.
209 $complete_triples = 0;
210 if ($lct['recurse']) {
// Two context shapes are built for the children. The first keeps the inherited context and
// only overrides base, lang and ns; the second is a fresh context built from local values.
// NOTE(review): the condition choosing between them is not visible here (presumably the
// skip flag); confirm.
212 $new_ct = array_merge($ct, array('base' => $lct['base'], 'lang' => $lct['lang'], 'ns' => $lct['ns']));
216 'base' => $lct['base'],
// Parent subject for children: the new subject if any, else the inherited parent subject.
217 'p_s' => $lct['new_s'] ? $lct['new_s'] : $ct['p_s'],
// Parent object for children: current object resource, else the new subject, else the inherited parent subject.
218 'p_o' => $lct['cur_o_res'] ? $lct['cur_o_res'] : ($lct['new_s'] ? $lct['new_s'] : $ct['p_s']),
220 'inco_ts' => $lct['inco_ts'],
221 'lang' => $lct['lang']
224 $sub_nodes = $this->getSubNodes($n);
225 foreach ($sub_nodes as $sub_node) {
226 if ($this->processNode($sub_node, $new_ct, $level+1)) {
227 $complete_triples = 1;
// Complete the incomplete triples inherited from the parent when this node added triples,
// a child reported success, or the new subject is a non-bnode value.
// NOTE(review): $other is not assigned in the statements visible here; confirm its value.
233 if ($ts_added || $complete_triples || ($lct['new_s'] && !preg_match('/^\_\:/', $lct['new_s'])) || ($other == 1)) {
234 //if (!$lct['skip'] && ($complete_triples || ($lct['new_s'] && !preg_match('/^\_\:/', $lct['new_s'])))) {
235 foreach ($ct['inco_ts'] as $inco_t) {
// Forward direction: the new subject of this node is the object; s_type is derived from the parent subject.
236 if ($inco_t['dir'] == 'fwd') {
239 's_type' => preg_match('/^\_\:/', $ct['p_s']) ? 'bnode' : 'uri',
241 'o' => $lct['new_s'],
242 'o_type' => preg_match('/^\_\:/', $lct['new_s']) ? 'bnode' : 'uri',
// Reverse direction: the new subject of this node is the subject; o_type is derived from the parent subject.
247 elseif ($inco_t['dir'] == 'rev') {
249 's' => $lct['new_s'],
250 's_type' => preg_match('/^\_\:/', $lct['new_s']) ? 'bnode' : 'uri',
253 'o_type' => preg_match('/^\_\:/', $ct['p_s']) ? 'bnode' : 'uri',
260 /* step 13 (12) (result flag) */
// Report success to the caller under the same three conditions tested above (without $other).
261 if ($ts_added) return 1;
262 if ($lct['new_s'] && !preg_match('/^\_\:/', $lct['new_s'])) return 1;
263 if ($complete_triples) return 1;
/**
 * Resolves the space-separated values of one attribute of a node through xURI().
 *
 * @param array  $n    Node; the attribute is looked up in $n["a"].
 * @param array  $ct   Inherited context (not read in the statements visible here).
 * @param array  $lct  Local context; supplies "base" and "ns" for resolution.
 * @param string $attr Attribute name; also passed to xURI() as the attribute type.
 *
 * NOTE(review): the result accumulation and return statement are not visible here;
 * callers use the result as a list of URIs that is falsy when empty. Confirm.
 */
269 function getAttributeURIs($n, $ct, $lct, $attr) {
// Split on single spaces; a missing or empty attribute gives an empty list.
270 $vals = ($val = $this->v($attr, '', $n['a'])) ? explode(' ', $val) : array();
272 foreach ($vals as $val) {
// Skip chunks that are falsy after trimming (empty strings, and also the string "0").
273 if(!trim($val)) continue;
// Only values that xURI() resolves to a non-empty URI pass this check.
274 if ((list($uri, $sub_v) = $this->xURI(trim($val), $lct['base'], $lct['ns'], $attr, $lct)) && $uri) {
/**
 * Builds the literal object for a property on node $n.
 *
 * @param array $n   Node; reads the "content" and "datatype" entries of $n["a"].
 * @param array $lct Local context; supplies "base", "ns" and "lang".
 * @param array $ct  Inherited context (not read in the statements visible here).
 *
 * Works on an array $r with keys "value", "lang" and "datatype".
 * NOTE(review): the return statement is not visible here; the caller reads those
 * three keys from the result. Confirm that $r is returned.
 */
283 function getCurrentObjectLiteral($n, $lct, $ct) {
// Two renderings of the node content.
// NOTE(review): presumably getContent() keeps markup and getPlainContent() strips it; confirm.
284 $xml_val = $this->getContent($n);
285 $plain_val = $this->getPlainContent($n, 0, 0);
// Decode HTML entities (including both quote styles) in the plain rendering when the function exists.
286 if (function_exists('html_entity_decode')) {
287 $plain_val = html_entity_decode($plain_val, ENT_QUOTES);
// Datatype: the raw attribute value is replaced by its xURI() resolution only when it is non-empty.
289 $dt = $this->v('datatype', '', $n['a']);
290 list($dt_uri, $sub_v) = $this->xURI($dt, $lct['base'], $lct['ns'], '', $lct);
291 $dt = $dt ? $dt_uri : $dt;
292 $r = array('value' => '', 'lang' => $lct['lang'], 'datatype' => $dt);
// 1. An explicit content attribute wins; its entities are decoded.
293 if (isset($n['a']['content'])) {
294 $r['value'] = $n['a']['content'];
295 if (function_exists('html_entity_decode')) {
296 $r['value'] = html_entity_decode($r['value'], ENT_QUOTES);
// 2. Both renderings are equal (loose comparison): use the plain text.
299 elseif ($xml_val == $plain_val) {
300 $r['value'] = $plain_val;
// 3. The markup rendering contains no angle brackets: use it as is.
302 elseif (!preg_match('/[\<\>]/', $xml_val)) {
303 $r['value'] = $xml_val;
// 4. Markup present, but an explicit datatype other than XMLLiteral: use the plain text.
305 elseif (isset($n['a']['datatype']) && ($dt != 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral')) {
306 $r['value'] = $plain_val;
// 5. Markup present and no datatype (or XMLLiteral): keep the markup, add namespace and
//    language declarations, and set the datatype to XMLLiteral.
308 elseif (!isset($n['a']['datatype']) || ($dt == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral')) {
309 $r['value'] = $this->injectXMLDeclarations($xml_val, $lct['ns'], $lct['lang']);
310 $r['datatype'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';
/**
 * Adds namespace and xml:lang declarations to a markup fragment using regex rewriting.
 *
 * @param string $val  Markup fragment.
 * @param array  $ns   Map of prefix to namespace URI.
 * @param string $lang Language code; when empty no xml:lang attribute is added.
 *
 * NOTE(review): the return statement is not visible here; the caller assigns the result
 * as the literal value, so presumably the rewritten $val is returned. Confirm.
 */
315 function injectXMLDeclarations($val, $ns, $lang) {//@@todo proper node rebuilding */
// Attribute text appended after each injected namespace declaration, or empty when no language is set.
316 $lang_code = $lang ? ' xml:lang="' . $lang . '"' : '';
// Every opening tag whose name consists only of letters/digits and is directly followed by
// ">" or whitespace gets the XHTML default namespace plus the language attribute.
318 $val = preg_replace('/<([a-z0-9]+)([\>\s])/is', '<\\1 xmlns="http://www.w3.org/1999/xhtml"' . $lang_code . '\\2', $val);
319 foreach ($ns as $prefix => $uri) {
// A space is prepended before searching so that a hit at offset 0 yields 1 (truthy);
// $pos - 1 is therefore the real offset of the first tag using this prefix.
320 if ($prefix && ($pos = strpos(' ' . $val, '<' . $prefix . ':'))) {
// Only that first prefixed tag receives the xmlns:prefix declaration (pattern anchored at
// the start of the tail substring).
// NOTE(review): $prefix is interpolated into the pattern without preg_quote().
321 $val = substr($val, 0, $pos - 1) . preg_replace('/^(<' . $prefix . '\:[^\>\s]+)/', '\\1 xmlns:' . $prefix. '="' . $uri . '"' . $lang_code, substr($val, $pos - 1));
324 /* remove accidentally added xml:lang and xmlns= */
// When a tag carries two xml:lang attributes (or two xmlns= attributes), the first occurrence
// is dropped and the later one kept. Each replacement is a single pass over the string.
325 $val = preg_replace('/(\<[^\>]*)( xml\:lang[^\s\>]+)([^\>]*)(xml\:lang[^\s\>]+)/s', '\\1\\3\\4', $val);
326 $val = preg_replace('/(\<[^\>]*)( xmlns=[^\s\>]+)([^\>]*)(xmlns=[^\s\>]+)/s', '\\1\\3\\4', $val);
/**
 * Resolves one attribute value to a URI (or bnode ID).
 *
 * @param string       $v         Raw attribute value.
 * @param string       $base      Base used for the final calcURI() fallback.
 * @param array        $ns        Map of prefix to namespace URI.
 * @param string       $attr_type Attribute name; only "rel" and "rev" trigger the keyword rules.
 * @param array|string $lct       Local context handed to xSafeCURIE(); an empty string when absent.
 * @return array Two-element array: the resolved value and a second string element.
 */
332 function xURI($v, $base, $ns, $attr_type = '', $lct = '') {
// The three CURIE extractors are tried in a fixed order (blank-node form, safe form, plain
// form); the first one that yields a truthy first element wins.
333 if ((list($sub_r, $sub_v) = $this->xBlankCURIE($v, $base, $ns)) && $sub_r) {
334 return array($sub_r, $sub_v);
336 if ((list($sub_r, $sub_v) = $this->xSafeCURIE($v, $base, $ns, $lct)) && $sub_r) {
337 return array($sub_r, $sub_v);
339 if ((list($sub_r, $sub_v) = $this->xCURIE($v, $base, $ns)) && $sub_r) {
340 return array($sub_r, $sub_v);
// rel/rev only: a leading keyword from the fixed list maps to the XHTML vocabulary namespace
// (keyword lower-cased); the second element is $v with that leading keyword removed.
342 if (preg_match('/^(rel|rev)$/', $attr_type) && preg_match('/^\s*(alternate|appendix|bookmark|cite|chapter|contents|copyright|glossary|help|icon|index|last|license|meta|next|p3pv1|prev|role|section|stylesheet|subsection|start|up)(\s|$)/is', $v, $m)) {
343 return array('http://www.w3.org/1999/xhtml/vocab#' . strtolower($m[1]), preg_replace('/^\s*' . $m[1]. '/is', '', $v));
// rel/rev only: any other bare token made of letters, digits and dots.
// NOTE(review): the handling of this case is not visible here; presumably such tokens are
// rejected instead of being resolved against the base. Confirm.
345 if (preg_match('/^(rel|rev)$/', $attr_type) && preg_match('/^[a-z0-9\.]+$/i', $v)) {
// Fallback: resolve the value against the base via calcURI().
348 return array($this->calcURI($v, $base), '');
/**
 * Recognises blank-node CURIEs.
 *
 * @param string $v    Raw attribute value.
 * @param string $base Not read in the statements visible here.
 * @param array  $ns   Not read in the statements visible here.
 *
 * NOTE(review): presumably x() matches the given pattern at the start of $v and returns the
 * match array or a falsy value; confirm. The no-match return is not visible here.
 */
351 function xBlankCURIE($v, $base, $ns) {
// "[_:]" (no label): every occurrence maps to one shared bnode ID, created on first use and
// cached in $this->empty_bnode.
352 if ($sub_r = $this->x('\[\_\:\]', $v)) {
353 $this->empty_bnode = isset($this->empty_bnode) ? $this->empty_bnode : $this->createBnodeID();
354 return array($this->empty_bnode, '');
// "_:label", optionally wrapped in square brackets: the captured "_:label" is returned as is.
356 if ($sub_r = $this->x('\[?(\_\:[a-z0-9\_\-]+)\]?', $v)) {
357 return array($sub_r[1], '');
/**
 * Recognises safe (square-bracketed) CURIEs.
 *
 * @param string       $v    Raw attribute value.
 * @param string       $base Used for "[]" when no local context is given.
 * @param array        $ns   Map of prefix to namespace URI.
 * @param array|string $lct  Local context; when truthy its "prev_s" entry is used for "[]".
 *
 * NOTE(review): presumably x() returns the regex match array or a falsy value; confirm.
 * The no-match return is not visible here.
 */
362 function xSafeCURIE($v, $base, $ns, $lct = '') {
// "[]": resolves to the previous subject from the local context, else to the base.
364 if ($sub_r = $this->x('\[\]', $v)) {
365 $r = $lct ? $lct['prev_s'] : $base;/* should be current subject value */
// NOTE(review): the pattern above has no capture group of its own, so $sub_r[1] depends on
// what x() appends to the pattern; confirm.
366 return $sub_r[1] ? array($r, $sub_r[1]) : array($r, '');
// "[prefix:reference]": group 1 is the prefix, group 2 the reference.
368 if ($sub_r = $this->x('\[([^\:]*)\:([^\]]*)\]', $v)) {
// Empty prefix: the reference is appended to the XHTML vocabulary namespace.
369 if (!$sub_r[1]) return array('http://www.w3.org/1999/xhtml/vocab#' . $sub_r[2], '');
// Known prefix: namespace URI followed by the reference.
370 if (isset($ns[$sub_r[1]])) {
371 return array($ns[$sub_r[1]] . $sub_r[2], '');
/**
 * Recognises plain (unbracketed) CURIEs of the form prefix:reference.
 *
 * @param string $v    Raw attribute value.
 * @param string $base Not read in the statements visible here.
 * @param array  $ns   Map of prefix to namespace URI.
 *
 * NOTE(review): presumably x() returns the regex match array or a falsy value; confirm.
 * The handling of unknown prefixes and the no-match return are not visible here.
 */
377 function xCURIE($v, $base, $ns) {
// Group 1: prefix (letters, digits, hyphen, underscore; may be empty).
// Group 2: reference (any run of non-whitespace characters, so dots are allowed).
378 if ($sub_r = $this->x('([a-z0-9\-\_]*)\:([^\s]+)', $v)) {
// Empty prefix: the reference is appended to the XHTML vocabulary namespace.
379 if (!$sub_r[1]) return array('http://www.w3.org/1999/xhtml/vocab#' . $sub_r[2], '');
// Known prefix: namespace URI followed by the reference.
380 if (isset($ns[$sub_r[1]])) {
381 return array($ns[$sub_r[1]] . $sub_r[2], '');