3 homepage: http://arc.semsol.org/
4 license: http://arc.semsol.org/license
6 class: ARC2 poshRDF Extractor
7 author: Benjamin Nowack
11 ARC2::inc('ARC2_RDFExtractor');
13 class ARC2_PoshRdfExtractor extends ARC2_RDFExtractor {
/**
 * Creates the extractor by forwarding both arguments to the parent extractor.
 *
 * The former `= ''` default on $a has been removed: an optional parameter
 * placed before a required one can never actually be omitted by a caller,
 * and PHP 8.0+ reports such declarations as deprecated. Every call that was
 * valid before (both arguments supplied) remains valid.
 *
 * @param mixed $a      Configuration value, passed through unchanged to the parent constructor.
 * @param mixed $caller Calling component; received by reference and forwarded to the parent constructor.
 */
function __construct($a, &$caller) {
  parent::__construct($a, $caller);
}
/**
 * PHP4-style constructor kept for backward compatibility; it only delegates
 * to __construct().
 *
 * The former `= ''` default on $a has been removed: an optional parameter
 * placed before a required one can never be omitted by a caller, and
 * PHP 8.0+ reports such declarations as deprecated. Calls that supply both
 * arguments behave exactly as before.
 *
 * @param mixed $a      Configuration value, forwarded to __construct().
 * @param mixed $caller Calling component; received by reference and forwarded to __construct().
 */
function ARC2_PoshRdfExtractor($a, &$caller) {
  $this->__construct($a, $caller);
}
/* Extractor setup: loads the poshRDF term table and registers default namespace prefixes. */
/* NOTE(review): the enclosing function header and the closing of the array literal are not visible in this excerpt. */
// Term table is read from the 'posh_terms' entry of the configuration array $this->a
// (presumably v() returns its second argument, here an empty array, when the key is absent - confirm).
25 $this->terms = $this->v('posh_terms', array(), $this->a);
// Prefix of poshRDF's own vocabulary; other methods use it as a key into the namespace map
// and strip "<prefix>-" from subject class names.
26 $this->ns_prefix = 'posh';
// Array union (+=): prefixes already present in $this->a['ns'] keep their value, only missing ones are added.
// NOTE(review): this assumes $this->a['ns'] is already an array at this point - confirm in the parent class.
27 $this->a['ns'] += array(
28 'an' => 'http://www.w3.org/2000/10/annotation-ns#',
29 'content' => 'http://purl.org/rss/1.0/modules/content/',
30 'dc' => 'http://purl.org/dc/elements/1.1/',
31 'dct' => 'http://purl.org/dc/terms/',
32 'foaf' => 'http://xmlns.com/foaf/0.1/',
33 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#',
34 'ical' => 'http://www.w3.org/2002/12/cal/icaltzd#',
35 'owl' => 'http://www.w3.org/2002/07/owl#',
36 'posh' => 'http://poshrdf.org/ns/posh/',
37 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
38 'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
39 'rev' => 'http://www.purl.org/stuff/rev#',
40 'rss' => 'http://purl.org/rss/1.0/',
41 'sioc' => 'http://rdfs.org/sioc/ns#',
42 'skos' => 'http://www.w3.org/2008/05/skos#',
43 'uri' => 'http://www.w3.org/2006/uri#',
44 'vcard' => 'http://www.w3.org/2006/vcard/ns#',
45 'xfn' => 'http://gmpg.org/xfn/11#',
46 'xml' => 'http://www.w3.org/XML/1998/namespace',
47 'xsd' => 'http://www.w3.org/2001/XMLSchema#',
/*
 * Entry point of the extraction: starts the recursive node walk at the root node
 * with an initial context built around the document base.
 * Returns 0 early when the caller has not detected the 'posh-rdf' format;
 * the return value on the normal path is not visible in this excerpt.
 */
53 function extractRDF() {
// Nothing to do unless the caller flagged the document as containing poshRDF.
54 if (!isset($this->caller->detected_formats['posh-rdf'])) return 0;
55 $n = $this->getRootNode();
56 $base = $this->getDocBase();
// NOTE(review): the following entries presumably belong to the $context array passed to
// processNode() below; its opening line and remaining keys are not visible here - confirm.
// 's' is the subject stack (list of [key, value] pairs), seeded with the document itself.
61 's' => array(array('_doc', $base)),
// 'next_s' is the pending subject for upcoming nodes; it also starts as the document.
62 'next_s' => array('_doc', $base),
// Namespace prefix map taken from the extractor configuration.
64 'ns' => $this->a['ns'],
// Walk the tree from the root: level 0, position 1.
68 $ct = $this->processNode($n, $context, 0, 1);
/*
 * Chooses the node at which extraction starts: scans $this->nodes for a node
 * whose tag is 'html' and otherwise falls back to the first node ($this->nodes[0]).
 */
73 function getRootNode() {
74 foreach ($this->nodes as $id => $node) {
75 if ($node['tag'] == 'html') {
// NOTE(review): the body of the branch above is not visible in this excerpt;
// presumably it returns the matching node - confirm.
// Fallback when the loop did not yield a result.
79 return $this->nodes[0];
/*
 * Processes a single node and, recursively, its sub nodes: derives a local
 * context, maintains the subject stack, collects predicates and adds triples
 * for nodes marked as objects.
 *
 * $n     - node array (reads 'tag' and the attribute array 'a')
 * $ct    - context handed down from the parent / previous sibling
 * $level - nesting depth, incremented by one for each recursion
 * $pos   - position value appended to the parent's 'rpointer'
 *
 * NOTE(review): the return statement is not visible in this excerpt. The sub-node
 * loop reads 'next_s' and 'ps' from the result of the recursive call, so the method
 * presumably returns a context array - confirm.
 */
84 function processNode($n, $ct, $level, $pos) {
85 $n = $this->preProcessNode($n);
// Local context: starts as a copy of $ct with the entries below overridden for this node.
87 $lct = array_merge($ct, array(
// Inherited prefixes merged with the node's 'xmlns' attribute entry; node-level values win.
88 'ns' => array_merge($ct['ns'], $this->v('xmlns', array(), $n['a'])),
// Pointer for this node: its id attribute if set, '' for cdata nodes, otherwise parent pointer + '/' + position.
89 'rpointer' => isset($n['a']['id']) ? $n['a']['id'] : ($n['tag'] == 'cdata' ? '' : $ct['rpointer'] . '/' . $pos),
// Language: the node's xml:lang attribute, falling back to the inherited language.
92 'lang' => $this->v('xml:lang', $ct['lang'], $n['a']),
// Apply the pending subject ('next_s') to the subject stack 's'.
95 $next_s_key = $lct['next_s'][0];
96 $next_s_val = $lct['next_s'][1];
// A pending subject whose key differs from the key on top of the stack is pushed to the front of the stack.
97 if ($lct['s'][0][0] != $next_s_key) {
98 $lct['s'] = array_merge(array($lct['next_s']), $lct['s']);
// NOTE(review): the branch structure around the next statement is not visible here; presumably
// it is the else case (same key on top), which only refreshes the stored subject value - confirm.
101 $lct['s'][0][1] = $next_s_val;
// A node carrying class 'rdf-s' announces a new pending subject: [full class attribute, subject id].
104 if ($this->hasClass($n, 'rdf-s')) {
105 $lct['next_s'] = array($n['a']['class'], $this->getSubject($n, $lct));
106 //echo "\ns: " . print_r($lct['next_s'], 1);
// A node marked 'rdf-p' (via class or rel) contributes predicates.
109 if ($this->hasClass($n, 'rdf-p') || $this->hasRel($n, 'rdf-p')) {
110 if ($ps = $this->getPredicates($n, $lct['ns'])) {
// NOTE(review): a line between the condition above and the call below is not visible;
// presumably it stores $ps in $lct['ps'], which addPoshTypes() iterates - confirm.
112 $this->addPoshTypes($lct);
// Object detection: class 'rdf-o' or 'rdf-o-<type>' with type in xml|dateTime|float|integer|boolean.
116 $cls = $this->v('class', '', $n['a']);
// Triples are only added while predicates are pending; $m[3] is the type suffix (empty for plain 'rdf-o').
117 if ($lct['ps'] && preg_match('/(^|\s)rdf\-(o|o\-(xml|dateTime|float|integer|boolean))($|\s)/s', $cls, $m)) {
118 $this->addTriples($n, $lct, $m[3]);
// Recurse into sub nodes.
121 if ($sub_nodes = $this->getSubNodes($n)) {
// NOTE(review): the initialisation of $cur_ct and $sub_pos is not visible in this excerpt.
124 foreach ($sub_nodes as $i => $sub_node) {
// Text and comment nodes are not processed as nodes of their own.
125 if (in_array($sub_node['tag'], array('cdata', 'comment'))) continue;
126 $sub_ct = $this->processNode($sub_node, $cur_ct, $level + 1, $sub_pos);
// Carry the pending subject and predicates produced by this sub node over to the following siblings.
128 $cur_ct['next_s'] = $sub_ct['next_s'];
129 $cur_ct['ps'] = $sub_ct['ps'];
/**
 * Determines the subject identifier for a node.
 *
 * The node attributes 'href uri', 'src uri', 'title' and 'value' are checked
 * in that order and the first one that is set (and not null) is returned.
 * When none is available, an identifier is generated from the context's
 * base and the node's rpointer: "<base>#resource(<rpointer>)".
 *
 * @param array $n  Node array; attributes are read from $n['a'].
 * @param array $ct Context array; reads 'base' and 'rpointer'.
 * @return mixed The attribute value found, or the generated identifier string.
 */
function getSubject($n, $ct) {
  $candidates = array('href uri', 'src uri', 'title', 'value');
  for ($i = 0, $max = count($candidates); $i < $max; $i++) {
    $attr = $candidates[$i];
    if (!isset($n['a'][$attr])) {
      continue;
    }
    return $n['a'][$attr];
  }
  /* no usable attribute: fall back to a pointer-based identifier */
  $doc_base = $ct['base'];
  return $doc_base . '#resource(' . $ct['rpointer'] . ')';
}
/*
 * Collects predicate URIs announced by a node's class and rel values.
 *
 * $n  - node array (attributes in $n['a'])
 * $ns - prefix => namespace URI map used to expand "prefix-name" tokens
 *
 * NOTE(review): the initialisation of $r, the body of the attribute fallback and the
 * return statement are not visible in this excerpt; processNode() treats the result
 * as truthy when predicates were found.
 */
145 function getPredicates($n, $ns) {
// Candidate tokens from both attributes.
// NOTE(review): 'class m' / 'rel m' look like pre-split (multi-value) attribute entries
// provided by the parser - confirm.
148 $vals = array_merge($this->v('class m', array(), $n['a']), $this->v('rel m', array(), $n['a']));
149 foreach ($vals as $val) {
// Only tokens of the form "<prefix>-<local name>" qualify (case-insensitive).
150 if (!preg_match('/^([a-z0-9]+)\-([a-z0-9\-\_]+)$/i', $val, $m)) continue;
// The prefix must be a known namespace prefix.
151 if (!isset($ns[$m[1]])) continue;
// The structural markers rdf-s / rdf-p / rdf-o[-type] are not predicates themselves.
152 if (preg_match('/^rdf-(s|p|o|o-(xml|dateTime|float|integer|boolean))$/', $val)) continue;
// Expand to a full URI: namespace URI + local name.
153 $r[] = $ns[$m[1]] . $m[2];
155 /* try other attributes */
// NOTE(review): the condition guarding this fallback and what is done with the attribute
// value are not visible in this excerpt.
157 foreach (array('href uri', 'title') as $k) {
158 if (isset($n['a'][$k])) {
/*
 * Builds one triple per pending predicate in $ct['ps'] for a node marked as an object.
 *
 * $n      - node array (attributes in $n['a'])
 * $ct     - current context (reads 'next_s', 'ns', 'ps', 'lang' and, via helpers, 's')
 * $o_type - type suffix of the rdf-o marker: 'xml', 'dateTime', 'float', 'integer',
 *           'boolean', or empty for a plain object
 *
 * NOTE(review): the statement that stores/emits the triple array and the remaining
 * triple fields are not visible in this excerpt.
 */
167 function addTriples($n, $ct, $o_type) {
// Resource-style object: taken from the node attributes listed here.
// NOTE(review): whether the loop stops at the first match is not visible here - confirm.
168 foreach (array('href uri', 'src uri', 'title', 'value') as $k) {
169 if (isset($n['a'][$k])) {
170 $node_o = $n['a'][$k];
// A node that is also a subject (class 'rdf-s') uses the pending subject as the object.
174 if (!isset($node_o) && $this->hasClass($n, 'rdf-s')) {
175 $node_o = $ct['next_s'][1];
// Literal-style object: getContent() for XML literals, getPlainContent() otherwise.
177 $lit_o = ($o_type == 'xml') ? $this->getContent($n) : $this->getPlainContent($n);
178 $posh_ns = $ct['ns'][$this->ns_prefix];
179 $rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
180 $xsd = 'http://www.w3.org/2001/XMLSchema#';
181 foreach ($ct['ps'] as $p) {
// Term key: the predicate URI with the posh namespace removed (other URIs stay unchanged).
182 $p_key = str_replace($posh_ns, '', $p);
// Datatype properties always take the literal; otherwise prefer the resource-style object if one was found.
184 $o = $this->isDatatypeProperty($p_key) ? $lit_o : (isset($node_o) ? $node_o : $lit_o);
// Skip predicates for which no container subject can be determined.
186 if (!$s = $this->getContainerSubject($ct, $p_key)) continue;
// A language tag is only used when the object equals the literal content (loose ==) and no type suffix was given.
187 $lang = (($o == $lit_o) && !$o_type) ? $ct['lang'] : '';
188 $o = $this->tweakObject($o, $p, $ct);
// NOTE(review): the subject is looked up a second time with the same arguments as for $s above.
190 's' => $this->getContainerSubject($ct, $p_key),
// Subjects starting with "_:" are blank nodes.
191 's_type' => preg_match('/^\_\:/', $s) ? 'bnode' : 'uri',
194 'o_type' => $this->getObjectType($o, $p_key),
// Datatype: rdf:XMLLiteral for 'xml', xsd:<type> for any other type suffix, empty otherwise.
196 'o_datatype' => ($o_type == 'xml') ? $rdf . 'XMLLiteral' : ($o_type ? $xsd . $o_type : ''),
/*
 * Builds rdf:type statements for pending predicates whose term is flagged as a subject
 * (see isSubject()): the pending subject $ct['next_s'][1] is typed with the posh
 * namespace + the capitalised term name.
 *
 * $ct - current context (reads 'ns', 'ps' and 'next_s')
 *
 * NOTE(review): the statement that stores/emits the triple array and its remaining
 * fields are not visible in this excerpt.
 */
201 function addPoshTypes($ct) {
202 $posh_ns = $ct['ns'][$this->ns_prefix];
203 foreach ($ct['ps'] as $p) {
// Term key: the predicate URI with the posh namespace removed.
204 $p_key = str_replace($posh_ns, '', $p);
// Only terms flagged as subjects produce a type statement.
205 if (!$this->isSubject($p_key)) continue;
206 $s = $ct['next_s'][1];
// Subjects starting with "_:" are blank nodes.
209 's_type' => preg_match('/^\_\:/', $s) ? 'bnode' : 'uri',
210 'p' => $ct['ns']['rdf'] . 'type',
// Class URI: posh namespace + term name with its first character upper-cased.
211 'o' => $posh_ns . ucfirst($p_key),
/*
 * Hook applied by processNode() to every node before it is examined; processNode()
 * continues with the value returned here.
 * NOTE(review): the body is not visible in this excerpt - presumably a pass-through
 * that subclasses can override; confirm.
 */
221 function preProcessNode($n) {
/*
 * Selects the subject a term's statement should be attached to.
 *
 * $ct   - current context; $ct['s'] is the subject stack of [key, value] pairs,
 *         with the entry at index 0 used as the default
 * $term - term key (predicate with the posh namespace removed)
 *
 * Terms that are unknown or define no 'scope' use the value of the first stack entry.
 * Scoped terms use the first stack entry whose key matches one of the scope names.
 * NOTE(review): what is returned when no stack entry matches is not visible in this
 * excerpt; addTriples() skips the predicate when the result is falsy.
 */
225 function getContainerSubject($ct, $term) {
226 if (!isset($this->terms[$term])) return $ct['s'][0][1];
227 $scope = $this->v('scope', array(), $this->terms[$term]);
228 if (!$scope) return $ct['s'][0][1];
// Scope names become a regex alternation.
// NOTE(review): the names are inserted without preg_quote(), so they are presumably
// plain identifiers - confirm.
229 $scope_re = join('|', $scope);
230 foreach ($ct['s'] as $s) {
// Match a scope name as a whole whitespace-separated token of the entry's key,
// after removing "<ns_prefix>-" from the key.
231 if (preg_match('/(^|\s)(' . $scope_re. ')($|\s)/s', str_replace($this->ns_prefix . '-', '', $s[0]))) return $s[1];
/**
 * Tells whether a term is flagged as a subject, i.e. whether its entry in the
 * term table ($this->terms) contains the value 's'.
 *
 * @param string $term Term key to look up.
 * @return int|bool 0 when the term is not in the term table, otherwise the in_array() result.
 */
function isSubject($term) {
  $is_known = isset($this->terms[$term]);
  return $is_known ? in_array('s', $this->terms[$term]) : 0;
}
/**
 * Tells whether a term is flagged as a datatype property, i.e. whether its
 * entry in the term table ($this->terms) contains the value 'plain'.
 *
 * @param string $term Term key to look up.
 * @return int|bool 0 when the term is not in the term table, otherwise the in_array() result.
 */
function isDatatypeProperty($term) {
  $is_known = isset($this->terms[$term]);
  return $is_known ? in_array('plain', $this->terms[$term]) : 0;
}
/**
 * Classifies an object value as 'literal', 'bnode' or 'uri'.
 *
 * Values of datatype properties and values with a space after the first
 * character are literals. Otherwise the value must look like
 * "<scheme>:<non-whitespace>" to be a resource: scheme "_" yields 'bnode',
 * any other scheme yields 'uri'. Everything else is a literal.
 *
 * @param string $o    Object value.
 * @param string $term Term key of the predicate the object belongs to.
 * @return string 'literal', 'bnode' or 'uri'.
 */
function getObjectType($o, $term) {
  if ($this->isDatatypeProperty($term) || strpos($o, ' ')) {
    return 'literal';
  }
  if (!preg_match('/^([a-z0-9\_]+)\:[^\s]+$/s', $o, $match)) {
    return 'literal';
  }
  /* "_:" prefix marks a blank node */
  return ($match[1] == '_') ? 'bnode' : 'uri';
}
252 function tweakObject($o, $p, $ct) {