3 homepage: http://arc.semsol.org/
4 license: http://arc.semsol.org/license
7 author: Benjamin Nowack
11 ARC2::inc('LegacyXMLParser');
13 class ARC2_RSSParser extends ARC2_LegacyXMLParser {
/**
 * Creates the RSS parser and hands all set-up over to the parent constructor.
 *
 * $a no longer declares a default value: an optional parameter placed in
 * front of the required $caller could not be left out in practice, and
 * PHP 8.0+ reports such a declaration as deprecated.
 *
 * @param mixed $a      Value passed through unchanged to the parent constructor.
 * @param mixed $caller Passed by reference and forwarded to the parent constructor.
 */
function __construct($a, &$caller) {
  parent::__construct($a, $caller);
}
/**
 * PHP4-style constructor, kept so existing callers keep working; it simply
 * forwards to __construct().
 *
 * The default on $a was removed because an optional parameter in front of
 * the required $caller is deprecated since PHP 8.0 and could not be omitted
 * by callers anyway.
 *
 * @param mixed $a      Value forwarded to __construct().
 * @param mixed $caller Passed by reference and forwarded to __construct().
 */
function ARC2_RSSParser($a, &$caller) {
  $this->__construct($a, $caller);
}
/**
 * Resets the parser's working state (reader set-up).
 *
 * Clears the triple buffer, the duplicate-tracking table and the cache, and
 * picks the blank-node prefix. The triple counter and the blank-node counter
 * are given a starting value here as well, because countTriples() and
 * createBnodeID() read them and nothing else visible in this class
 * initialises them.
 *
 * NOTE(review): the parent's __init() is not invoked from here; confirm
 * whether ARC2_LegacyXMLParser relies on its own initialisation being run.
 */
function __init() {
  /* triple storage */
  $this->triples = array();
  $this->t_count = 0;
  $this->added_triples = array();
  $this->skip_dupes = false;
  $this->target_encoding = '';

  /* blank nodes: the "bnode_prefix" entry is looked up in $this->a via v();
     a random "arc<4 hex chars>b" token is passed as the second argument
     (presumably the fallback value; verify against v()) */
  $random_prefix = 'arc' . substr(md5(uniqid(rand())), 0, 4) . 'b';
  $this->bnode_prefix = $this->v('bnode_prefix', $random_prefix, $this->a);
  $this->bnode_id = 0;

  $this->cache = array();
  $this->allowCDataNodes = 0;
}
/**
 * Attaches a reader to this parser.
 *
 * The property is bound by reference, so $this->reader and the variable
 * handed in by the caller refer to the same value.
 *
 * @param mixed $reader Reader to attach (taken by reference).
 */
function setReader(&$reader)
{
  $this->reader = &$reader;
}
/**
 * Produces a new blank-node identifier of the form "_:<prefix><counter>".
 *
 * The counter is advanced on every call so that consecutive calls yield
 * distinct identifiers; without the increment each call returned the same
 * ID. The isset() guard lets the method work even when the counter has not
 * been given a starting value yet (the first ID then ends in 1).
 *
 * @return string Blank-node identifier.
 */
function createBnodeID() {
  $this->bnode_id = isset($this->bnode_id) ? $this->bnode_id + 1 : 1;
  return '_:' . $this->bnode_prefix . $this->bnode_id;
}
/*
 * Headerless fragment: the signature of the enclosing function is not part
 * of this listing. The visible statements store the triple $t in
 * $this->triples at index $this->t_count. When skip_dupes is enabled, a
 * triple is stored only if its fingerprint is not yet in added_triples.
 */
54 //if (!isset($t['o_datatype']))
55 if ($this->skip_dupes) {
/* fingerprint of the complete triple, used as key in added_triples */
56 $h = md5(serialize($t));
/* store the triple only when this fingerprint has not been recorded before */
57 if (!isset($this->added_triples[$h])) {
58 $this->triples[$this->t_count] = $t;
/* remember the fingerprint so an identical triple is skipped next time */
60 $this->added_triples[$h] = true;
/* NOTE(review): the lines between the branch above and this store are not
   visible; presumably this is the path taken when skip_dupes is off - confirm */
64 $this->triples[$this->t_count] = $t;
/**
 * Returns the triples collected by the parser.
 *
 * The value is looked up under the name "triples" through v(), with an
 * empty array handed in as the second argument.
 *
 * @return mixed Whatever v() yields for "triples".
 */
function getTriples()
{
  $fallback = array();
  $collected = $this->v('triples', $fallback);
  return $collected;
}
/**
 * Returns the current value of the triple counter.
 *
 * Falls back to 0 when the counter has not been set yet, instead of
 * returning an unset property (null plus an "undefined property" notice).
 *
 * @return int|mixed The counter value, or 0 if it was never set.
 */
function countTriples() {
  return isset($this->t_count) ? $this->t_count : 0;
}
/**
 * Builds a "simple index" from the collected triples.
 *
 * Thin wrapper: fetches the triples via getTriples() and delegates to
 * ARC2::getSimpleIndex(), forwarding both options unchanged.
 *
 * @param mixed $flatten_objects Forwarded to ARC2::getSimpleIndex() (default 1).
 * @param mixed $vals            Forwarded to ARC2::getSimpleIndex() (default '').
 * @return mixed Result of ARC2::getSimpleIndex().
 */
function getSimpleIndex($flatten_objects = 1, $vals = '')
{
  $collected = $this->getTriples();
  return ARC2::getSimpleIndex($collected, $flatten_objects, $vals);
}
/*
 * extractRDF: sets up the namespace URIs and the tag-to-property mapping,
 * then walks the node index and converts every "channel" and "item" node
 * into triples via extractChannel()/extractItem() and
 * ARC2::getTriplesFromIndex().
 * NOTE(review): the array terminators and the bodies of the inner
 * "foreach ($triples as $t)" loops are not visible in this listing.
 */
83 function extractRDF() {
/* node index supplied by getNodeIndex(); iterated below as id => list of nodes */
84 $index = $this->getNodeIndex();
/* namespace URIs used to build class and property URIs */
85 $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
86 $this->rss = 'http://purl.org/rss/1.0/';
87 $this->dc = 'http://purl.org/dc/elements/1.1/';
88 $this->dct = 'http://purl.org/dc/terms/';
89 $this->content = 'http://purl.org/rss/1.0/modules/content/';
90 $this->enc = 'http://purl.oclc.org/net/rss_2.0/enc#';
/* tag name => property URI; extractProps() consults this table for tags without a prefix */
91 $this->mappings = array(
92 'channel' => $this->rss . 'channel',
93 'item' => $this->rss . 'item',
94 'title' => $this->rss . 'title',
95 'link' => $this->rss . 'link',
96 'description' => $this->rss . 'description',
97 'guid' => $this->dc . 'identifier',
98 'author' => $this->dc . 'creator',
99 'category' => $this->dc . 'subject',
/* both spellings of the date tag map to dc:date */
100 'pubDate' => $this->dc . 'date',
101 'pubdate' => $this->dc . 'date',
102 'source' => $this->dc . 'source',
103 'enclosure' => $this->enc . 'enclosure',
/* properties whose values extractProps() always types as 'literal', even when they look like a URI */
105 $this->dt_props = array(
106 $this->dc . 'identifier',
/* visit every node in the index and dispatch on its tag name */
109 foreach ($index as $p_id => $nodes) {
110 foreach ($nodes as $pos => $node) {
111 $tag = $this->v('tag', '', $node);
/* channel: the entries indexed under the node's id are passed to extractChannel() */
112 if ($tag == 'channel') {
113 $struct = $this->extractChannel($index[$node['id']]);
114 $triples = ARC2::getTriplesFromIndex($struct);
115 foreach ($triples as $t) {
/* item: same procedure, using extractItem() */
119 elseif ($tag == 'item') {
120 $struct = $this->extractItem($index[$node['id']]);
121 $triples = ARC2::getTriplesFromIndex($struct);
122 foreach ($triples as $t) {
/**
 * Builds the resource index entry for an RSS channel.
 *
 * The result starts with an rdf:type of rss:channel, is merged with the
 * properties returned by extractProps(), and is keyed by the value of the
 * channel's first rss:link.
 *
 * A channel without an rss:link used to be dereferenced blindly, which
 * raised a warning and keyed the entry under an empty subject. It now
 * yields an empty index, in line with the isset() guards extractItem() uses.
 *
 * @param mixed $els Child elements of the channel node, handed to extractProps().
 * @return array array(<link value> => <property map>), or an empty array
 *               when the channel has no link.
 */
function extractChannel($els) {
  $link_prop = $this->rss . 'link';
  $res = array($this->rdf . 'type' => array(array('value' => $this->rss . 'channel', 'type' => 'uri')));
  $res = array_merge($res, $this->extractProps($els, 'channel'));
  /* no usable subject without a link */
  if (!isset($res[$link_prop][0]['value'])) return array();
  return array($res[$link_prop][0]['value'] => $res);
}
/**
 * Builds the resource index entry for an RSS item.
 *
 * The result starts with an rdf:type of rss:item and is merged with the
 * properties returned by extractProps(). The entry is keyed by the first
 * rss:link value, or, when no link exists, by the first dc:identifier value.
 *
 * When neither property is present the method now returns an empty array
 * explicitly instead of falling off the end and returning null.
 *
 * @param mixed $els Child elements of the item node, handed to extractProps().
 * @return array array(<subject> => <property map>), or an empty array when
 *               the item has neither a link nor an identifier.
 */
function extractItem($els) {
  $res = array($this->rdf . 'type' => array(array('value' => $this->rss . 'item', 'type' => 'uri')));
  $res = array_merge($res, $this->extractProps($els, 'item'));
  /* subject candidates in order of preference */
  foreach (array($this->rss . 'link', $this->dc . 'identifier') as $id_prop) {
    if (isset($res[$id_prop])) return array($res[$id_prop][0]['value'] => $res);
  }
  return array();
}
143 function extractProps($els, $container) {
145 foreach ($els as $info) {
148 if (!preg_match('/^[a-z0-9]+\:/i', $tag)) {
149 $k = isset($this->mappings[$tag]) ? $this->mappings[$tag] : '';
154 if (($container == 'channel') && ($k == $this->rss . 'item')) continue;
157 if (!$v) $v = $this->v('url', '', $info['a']);
158 if (!$v) $v = $this->v('href', '', $info['a']);
161 /* enclosure handling */
162 if ($k == $this->enc . 'enclosure') {
164 foreach (array('length', 'type') as $attr) {
165 if ($attr_v = $this->v($attr, 0, $info['a'])) {
166 $sub_res[$this->enc . $attr] = array(array('value' => $attr_v, 'type' => 'literal'));
169 $struct[$v] = $sub_res;
172 if (in_array($k, array($this->dc . 'date', $this->dct . 'modified'))) {
173 if (!preg_match('/^[0-9]{4}/', $v) && ($sub_v = strtotime($v)) && ($sub_v != -1)) {
174 $tz = date('Z', $sub_v); /* timezone offset */
175 $sub_v -= $tz; /* utc */
176 $v = date('Y-m-d\TH:i:s\Z', $sub_v);
179 if (!isset($res[$k])) $res[$k] = array();
180 $res[$k][] = array('value' => $v, 'type' => in_array($k, $this->dt_props) || !preg_match('/^[a-z0-9]+\:[^\s]+$/is', $v) ? 'literal' : 'uri');