3 homepage: http://arc.semsol.org/
4 license: http://arc.semsol.org/license
6 class: ARC2 Atom Parser
7 author: Benjamin Nowack
8 version: 2009-04-21 (Addition: support for link types)
11 ARC2::inc('LegacyXMLParser');
13 class ARC2_AtomParser extends ARC2_LegacyXMLParser {
/**
 * Creates the Atom parser and forwards both arguments to the parent
 * (legacy XML parser) constructor.
 *
 * The first parameter deliberately has no default value: a default placed in
 * front of the required by-reference $caller could never be omitted by a
 * caller, and PHP 8 reports that kind of signature as deprecated.
 *
 * @param mixed $a       Passed through unchanged to the parent constructor.
 * @param mixed $caller  Taken by reference and passed through to the parent constructor.
 */
function __construct($a, &$caller) {
  parent::__construct($a, $caller);
}
/**
 * PHP 4 style constructor kept for older call sites; it only delegates to
 * __construct().
 *
 * The first parameter has no default value because a default in front of the
 * required by-reference $caller can never be used and is reported as
 * deprecated by PHP 8.
 *
 * @param mixed $a       Passed through unchanged to __construct().
 * @param mixed $caller  Taken by reference and passed through to __construct().
 */
function ARC2_AtomParser($a, &$caller) {
  $this->__construct($a, $caller);
}
/**
 * Resets the per-run reader state of the parser: the triple store, the
 * duplicate-tracking table, the bnode prefix, the cache and the CDATA flag.
 */
function __init() {/* reader */
  /* triple storage and duplicate tracking start out empty / disabled */
  $this->target_encoding = '';
  $this->triples = array();
  $this->skip_dupes = false;
  $this->added_triples = array();

  /* fallback prefix: 'arc' + 4 hex chars of a random md5 + 'b'; handed to v() together with $this->a */
  $fallback_prefix = 'arc' . substr(md5(uniqid(rand())), 0, 4) . 'b';
  $this->bnode_prefix = $this->v('bnode_prefix', $fallback_prefix, $this->a);

  $this->allowCDataNodes = 0;
  $this->cache = array();
}
/**
 * Attaches a reader to this parser.
 *
 * @param mixed $reader  Stored by reference in $this->reader, so the parser
 *                       and the caller share the same variable.
 */
function setReader(&$reader) {
  $this->reader = &$reader;
}
/**
 * Builds a blank node identifier of the form "_:" + bnode prefix + bnode counter.
 *
 * NOTE(review): the counter is only read here; confirm where $this->bnode_id
 * is advanced between calls.
 *
 * @return string  The blank node label.
 */
function createBnodeID() {
  $local_part = $this->bnode_prefix . $this->bnode_id;
  return '_:' . $local_part;
}
/* NOTE(review): the function header for this body is not part of the visible source. */
/* The statements below store the triple array $t in $this->triples at index $this->t_count. */
54 //if (!isset($t['o_datatype']))
/* When duplicate skipping is on, a triple is stored only if its hash has not been recorded yet. */
55 if ($this->skip_dupes) {
56 //$h = md5(print_r($t, 1));
/* md5 of the serialized triple is the key into $this->added_triples */
57 $h = md5(serialize($t));
58 if (!isset($this->added_triples[$h])) {
59 $this->triples[$this->t_count] = $t;
/* remember the hash so an identical triple is not stored again */
61 $this->added_triples[$h] = true;
/* presumably the branch taken when duplicate skipping is off (the branch keyword is not visible): store unconditionally */
65 $this->triples[$this->t_count] = $t;
/**
 * Returns the parser's triples, looked up through v() under the name
 * 'triples' with an empty array passed as the fallback argument.
 *
 * @return mixed  Whatever v('triples', array()) yields.
 */
function getTriples() {
  $fallback = array();
  $stored = $this->v('triples', $fallback);
  return $stored;
}
/**
 * Reports the current value of the triple counter.
 *
 * @return mixed  The value of $this->t_count.
 */
function countTriples() {
  $total = $this->t_count;
  return $total;
}
/**
 * Converts the parser's triples into a simple index by delegating to
 * ARC2::getSimpleIndex().
 *
 * @param mixed $flatten_objects  Forwarded unchanged to ARC2::getSimpleIndex() (default 1).
 * @param mixed $vals             Forwarded unchanged to ARC2::getSimpleIndex() (default '').
 * @return mixed  The result of ARC2::getSimpleIndex().
 */
function getSimpleIndex($flatten_objects = 1, $vals = '') {
  $triples = $this->getTriples();
  $simple_index = ARC2::getSimpleIndex($triples, $flatten_objects, $vals);
  return $simple_index;
}
/**
 * Walks the parsed node index and turns Atom feed/entry nodes into triples.
 *
 * Sets the namespace URIs and the Atom-tag-to-property mappings on $this,
 * then visits every node in the index: 'feed' nodes go through
 * extractChannel(), 'entry' nodes through extractItem(), and each resulting
 * structure is converted with ARC2::getTriplesFromIndex().
 *
 * NOTE(review): the bodies of the two inner loops over $triples are not
 * visible here; confirm what is done with each triple.
 */
84 function extractRDF() {
85 $index = $this->getNodeIndex();
/* namespace URIs used to build property and type URIs below and in the extract* helpers */
87 $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
88 $this->atom = 'http://www.w3.org/2005/Atom';
89 $this->rss = 'http://purl.org/rss/1.0/';
90 $this->dc = 'http://purl.org/dc/elements/1.1/';
91 $this->sioc = 'http://rdfs.org/sioc/ns#';
92 $this->dct = 'http://purl.org/dc/terms/';
93 $this->content = 'http://purl.org/rss/1.0/modules/content/';
94 $this->enc = 'http://purl.oclc.org/net/rss_2.0/enc#';
/* Atom element name => RSS 1.0 / Dublin Core / content-module property (or class) URI */
95 $this->mappings = array(
96 'feed' => $this->rss . 'channel',
97 'entry' => $this->rss . 'item',
98 'title' => $this->rss . 'title',
99 'link' => $this->rss . 'link',
100 'summary' => $this->rss . 'description',
101 'content' => $this->content . 'encoded',
102 'id' => $this->dc . 'identifier',
103 'author' => $this->dc . 'creator',
104 'category' => $this->dc . 'subject',
105 'updated' => $this->dc . 'date',
106 'source' => $this->dc . 'source',
/* properties whose values extractProps() always types as 'literal', even when they look like URIs */
108 $this->dt_props = array(
109 $this->dc . 'identifier',
/* the index is keyed by parent id; each entry is a list of child nodes */
112 foreach ($index as $p_id => $nodes) {
113 foreach ($nodes as $pos => $node) {
114 $tag = $this->v('tag', '', $node);
/* feed element: its children (looked up by the node's own id) describe the channel */
115 if ($tag == 'feed') {
116 $struct = $this->extractChannel($index[$node['id']]);
117 $triples = ARC2::getTriplesFromIndex($struct);
118 foreach ($triples as $t) {
/* entry element: its children describe one item */
122 elseif ($tag == 'entry') {
123 $struct = $this->extractItem($index[$node['id']]);
124 $triples = ARC2::getTriplesFromIndex($struct);
125 foreach ($triples as $t) {
/**
 * Builds the resource index for a feed (channel) from its child elements.
 *
 * The subject is the first rss:link value. When the feed has no usable link,
 * the first dc:identifier value (mapped from the Atom id element) is used
 * instead, and as a last resort a blank node id, so that a link-less feed no
 * longer triggers an undefined-key warning and an empty-string subject.
 *
 * @param array $els  Child element nodes of the feed element.
 * @return mixed  Result of ARC2::getMergedIndex() for the channel and its sub index.
 */
function extractChannel($els) {
  list($props, $sub_index) = $this->extractProps($els, 'channel');
  $uri = '';
  /* candidate subject properties, in order of preference */
  foreach (array($this->rss . 'link', $this->dc . 'identifier') as $candidate) {
    if (isset($props[$candidate][0]['value']) && ($props[$candidate][0]['value'] !== '')) {
      $uri = $props[$candidate][0]['value'];
      break;
    }
  }
  if ($uri === '') {
    $uri = $this->createBnodeID();
  }
  return ARC2::getMergedIndex(array($uri => $props), $sub_index);
}
/**
 * Builds the resource index for a single entry (item) from its child elements.
 *
 * The subject is the first rss:link value. When the entry has no usable link,
 * the first dc:identifier value (mapped from the Atom id element) is used
 * instead, and as a last resort a blank node id, so that a link-less entry no
 * longer triggers an undefined-key warning and an empty-string subject.
 *
 * @param array $els  Child element nodes of the entry element.
 * @return mixed  Result of ARC2::getMergedIndex() for the item and its sub index.
 */
function extractItem($els) {
  list($props, $sub_index) = $this->extractProps($els, 'item');
  $uri = '';
  /* candidate subject properties, in order of preference */
  foreach (array($this->rss . 'link', $this->dc . 'identifier') as $candidate) {
    if (isset($props[$candidate][0]['value']) && ($props[$candidate][0]['value'] !== '')) {
      $uri = $props[$candidate][0]['value'];
      break;
    }
  }
  if ($uri === '') {
    $uri = $this->createBnodeID();
  }
  return ARC2::getMergedIndex(array($uri => $props), $sub_index);
}
/**
 * Collects the properties of a feed or entry from its child elements.
 *
 * @param array  $els        Child element nodes to inspect.
 * @param string $container  'channel' or 'item' at the visible call sites; appended to the
 *                           RSS namespace to form the rdf:type value.
 * @return array  Two-element array: the property map ($r) and a sub index ($sub_index)
 *                holding statements about other resources (link formats).
 *
 * NOTE(review): $tag is read below but its assignment is not visible in this
 * block; presumably it is the tag name of $info - confirm.
 */
145 function extractProps($els, $container) {
/* every result starts with an rdf:type statement pointing at rss:<container> */
146 $r = array($this->rdf . 'type' => array(array('value' => $this->rss . $container, 'type' => 'uri')));
147 $sub_index = array();
148 foreach ($els as $info) {
/* tag without a "prefix:" part: use the mapped property URI, or '' when unmapped */
151 if (!preg_match('/^[a-z0-9]+\:/i', $tag)) {
152 $k = isset($this->mappings[$tag]) ? $this->mappings[$tag] : '';
/* prefixed tag: an explicit mapping wins, otherwise the prefixed name is expanded */
154 elseif (isset($this->mappings[$tag])) {
155 $k = $this->mappings[$tag];
158 $k = $this->expandPName($tag);
/* entries nested in a feed are not properties of the channel */
161 if (($container == 'channel') && ($k == $this->rss . 'item')) continue;
/* default value: trimmed character data, else an attribute looked up via v('href uri', ...) */
163 $v = trim($info['cdata']);
164 if (!$v) $v = $this->v('href uri', '', $info['a']);
167 /* content handling */
168 if (in_array($k, array($this->rss . 'description', $this->content . 'encoded'))) {
169 $v = $this->getNodeContent($info);
171 /* source handling */
/* the value of a source element is the character data of its 'id' child */
172 elseif ($k == $this->dc . 'source') {
173 $sub_nodes = $this->node_index[$info['id']];
174 foreach ($sub_nodes as $sub_pos => $sub_info) {
175 if ($sub_info['tag'] == 'id') {
176 $v = trim($sub_info['cdata']);
/* link with a type attribute: record a dc:format literal about the link target in the sub index */
181 elseif ($k == $this->rss . 'link') {
182 if ($link_type = $this->v('type', '', $info['a'])) {
183 $k2 = $this->dc . 'format';
184 if (!isset($sub_index[$v])) $sub_index[$v] = array();
185 if (!isset($sub_index[$v][$k2])) $sub_index[$v][$k2] = array();
186 $sub_index[$v][$k2][] = array('value' => $link_type, 'type' => 'literal');
189 /* author handling */
/* 'name' child becomes the creator value; 'uri' child adds a sioc:has_creator URI value */
190 elseif ($k == $this->dc . 'creator') {
191 $sub_nodes = $this->node_index[$info['id']];
192 foreach ($sub_nodes as $sub_pos => $sub_info) {
193 if ($sub_info['tag'] == 'name') {
194 $v = trim($sub_info['cdata']);
196 if ($sub_info['tag'] == 'uri') {
197 $k2 = $this->sioc . 'has_creator';
198 $v2 = trim($sub_info['cdata']);
199 if (!isset($r[$k2])) $r[$k2] = array();
200 $r[$k2][] = array('value' => $v2, 'type' => 'uri');
/* dates that do not start with a 4-digit year are parsed and rewritten as UTC "Y-m-dTH:i:sZ" */
205 elseif (in_array($k, array($this->dc . 'date', $this->dct . 'modified'))) {
/* the -1 comparison covers old PHP versions in which strtotime() reported failure as -1 */
206 if (!preg_match('/^[0-9]{4}/', $v) && ($sub_v = strtotime($v)) && ($sub_v != -1)) {
207 $tz = date('Z', $sub_v); /* timezone offset */
208 $sub_v -= $tz; /* utc */
209 $v = date('Y-m-d\TH:i:s\Z', $sub_v);
/* category: the value comes from the 'term' attribute */
213 elseif ($k == $this->dc . 'subject') {
214 $v = $this->v('term', '', $info['a']);
216 /* other attributes in closed tags */
217 elseif (!$v && ($info['state'] == 'closed') && $info['a']) {
218 foreach ($info['a'] as $sub_k => $sub_v) {
/* attribute names containing 'xmlns', ':' or 'type' are skipped; NOTE(review): the branch body is not visible here */
219 if (!preg_match('/(xmlns|\:|type)/', $sub_k)) {
/* append the value; it is typed 'literal' for dt_props properties or when it does not look like "scheme:rest", otherwise 'uri' */
225 if (!isset($r[$k])) $r[$k] = array();
226 $r[$k][] = array('value' => $v, 'type' => in_array($k, $this->dt_props) || !preg_match('/^[a-z0-9]+\:[^\s]+$/is', $v) ? 'literal' : 'uri');
229 return array($r, $sub_index);
/**
 * Creates and configures the expat XML parser once; later calls find
 * $this->xml_parser already set and skip the setup.
 *
 * NOTE(review): xml_set_object() together with string handler names is
 * deprecated as of PHP 8.4; array callables such as array($this, 'open')
 * would be the replacement - confirm the supported PHP range before changing.
 */
232 function initXMLParser() {
233 if (!isset($this->xml_parser)) {
/* only utf-8, iso-8859-1 and us-ascii are passed on as detected; any other detected encoding falls back to UTF-8 */
234 $enc = preg_match('/^(utf\-8|iso\-8859\-1|us\-ascii)$/i', $this->getEncoding(), $m) ? $m[1] : 'UTF-8';
235 $parser = xml_parser_create($enc);
/* keep whitespace-only character data and leave element names in their original case */
236 xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 0);
237 xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
/* handlers are given by name; xml_set_object() below makes them resolve to methods of this object */
238 xml_set_element_handler($parser, 'open', 'close');
239 xml_set_character_data_handler($parser, 'cData');
240 xml_set_start_namespace_decl_handler($parser, 'nsDecl');
241 xml_set_object($parser, $this);
242 $this->xml_parser =& $parser;