3 * FusionForge text sanitisation
5 * Copyright (C) 2005, Daniel Perez
6 * Copyright (C) 2008-2009 Alcatel-Lucent
8 * This file is part of FusionForge. FusionForge is free software;
9 * you can redistribute it and/or modify it under the terms of the
10 * GNU General Public License as published by the Free Software
11 * Foundation; either version 2 of the Licence, or (at your option)
14 * FusionForge is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with FusionForge; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * Standard Alcatel-Lucent disclaimer for contributing to open source
27 * "The Style Sheet ("Contribution") has not been tested and/or
28 * validated for release as or in products, combinations with products or
29 * other commercial use. Any use of the Contribution is entirely made at
30 * the user's own responsibility and the user can not rely on any features,
31 * functionalities or performances Alcatel-Lucent has attributed to the
34 * THE CONTRIBUTION BY ALCATEL-LUCENT IS PROVIDED AS IS, WITHOUT WARRANTY
35 * OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
36 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, COMPLIANCE,
37 * NON-INTERFERENCE AND/OR INTERWORKING WITH THE SOFTWARE TO WHICH THE
38 * CONTRIBUTION HAS BEEN MADE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL
39 * ALCATEL-LUCENT BE LIABLE FOR ANY DAMAGES OR OTHER LIABLITY, WHETHER IN
40 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
41 * CONTRIBUTION OR THE USE OR OTHER DEALINGS IN THE CONTRIBUTION, WHETHER
42 * TOGETHER WITH THE SOFTWARE TO WHICH THE CONTRIBUTION RELATES OR ON A STAND
46 require_once 'HTMLPurifier.auto.php';
48 class TextSanitizer extends Error {
/**
 * convertExtendedCharsForEmail - Replace a fixed set of HTML entities (and <br> tags) by the plain characters they stand for.
 *
 * The pairs below are applied one after the other, in the listed order,
 * each over the whole string. The output of an earlier pair can therefore
 * be matched by a later one (e.g. "&amp;lt;" first becomes "&lt;" and
 * then "<").
 *
 * @param	string	$text	The input string
 * @return	string	The output string
 */
function convertExtendedCharsForEmail($text) {
	$replacements = array(
		// Punctuation, symbols and the HTML special characters.
		'&acute;'  => "'", // it's better to see that char in the email than the html entity
		'&amp;'    => '&',
		'&quot;'   => '"',
		'&rsquo;'  => '’',
		'&nbsp;'   => ' ',
		'&lt;'     => '<',
		'&gt;'     => '>',
		'&deg;'    => '°',
		'&sup2;'   => '²',
		'&euro;'   => '€',
		'&uml;'    => '¨',
		'&pound;'  => '£',
		'&curren;' => '¤',
		'&micro;'  => 'µ',
		'&sect;'   => '§',
		'&oelig;'  => 'œ',
		// Line breaks become real newlines.
		'<br>'     => "\n",
		'<br />'   => "\n",
		// Accented letters.
		'&eacute;' => 'é',
		'&egrave;' => 'è',
		'&ecirc;'  => 'ê',
		'&euml;'   => 'ë',
		'&agrave;' => 'à',
		'&acirc;'  => 'â',
		'&ccedil;' => 'ç',
		'&ugrave;' => 'ù',
		'&ucirc;'  => 'û',
		'&uuml;'   => 'ü',
		'&ocirc;'  => 'ô',
		'&iuml;'   => 'ï',
	);

	// With array arguments str_replace() handles the pairs sequentially,
	// exactly like a chain of single str_replace() calls would.
	return str_replace(array_keys($replacements), array_values($replacements), $text);
}
/**
 * convertNeededTagsForEmail - Turn the few HTML tags that matter for layout into plain-text equivalents.
 *
 * Line breaks and paragraph tags become newlines, list items become
 * "\n - " bullets, list wrappers are dropped (a closing </ul> leaves a
 * newline) and UTF-8 non-breaking spaces become ordinary spaces.
 * Matching is case-sensitive and only the exact tag spellings listed
 * below are recognised; any other markup is left untouched.
 *
 * @param	string	$text	The input string
 * @return	string	The output string
 */
function convertNeededTagsForEmail($text) {
	$replacements = array(
		'<br>'     => "\n",
		'<br />'   => "\n",
		'<br/>'    => "\n",
		'<p>'      => "\n",
		'</p>'     => "\n",
		'<li>'     => "\n - ",
		'</li>'    => '',
		'<ul>'     => '',
		'</ul>'    => "\n",
		"\xc2\xa0" => ' ', // UTF-8 encoded non-breaking space
	);

	return str_replace(array_keys($replacements), array_values($replacements), $text);
}
/**
 * unhtmlentities - Convert the entities produced by htmlspecialchars() back to their characters.
 *
 * The translation is done in a single pass, so a double-escaped quote
 * ("&amp;quot;") comes out of that pass as "&quot;"; the final replacement
 * turns those leftovers into a plain double quote as well.
 *
 * @param	string	$string	The escaped input string
 * @return	string	The decoded string
 */
function unhtmlentities($string) {
	// Flip the char => entity table into an entity => char table.
	$entity_to_char = array_flip(get_html_translation_table(HTML_SPECIALCHARS));
	$res = strtr($string, $entity_to_char);

	return str_replace("&quot;", '"', $res);
}
/**
 * SanitizeHtml - Escape a piece of HTML, then re-enable a fixed whitelist of tags.
 *
 * The whole input is first passed through htmlspecialchars(). Ampersands,
 * double quotes and the '>' that follows a '/' or a '"' are then restored,
 * and finally the escaped form of every whitelisted tag fragment is turned
 * back into real markup. Anything not on the whitelist stays escaped.
 *
 * NOTE(review): fragments such as '<a ', '<img ' or '<input ' re-enable the
 * tag together with whatever attributes follow it, and quotes are restored
 * globally, so attribute content (event handlers, javascript: URLs, ...)
 * is not filtered here. purify() in this class relies on HTML Purifier and
 * is the safer choice for untrusted input.
 *
 * @param	string	$input	The HTML Code
 * @return	string	The HTML output
 */
function SanitizeHtml($input) {
	$input = htmlspecialchars($input); // escape everything first

	// Generic un-escaping needed by the whitelisted tags below.
	$input = str_replace('&amp;', '&', $input);
	$input = str_replace('&quot;', '"', $input);
	$input = str_replace('/&gt;', '/>', $input);
	$input = str_replace('"&gt;', '">', $input);

	// Whitelisted fragments, in the order they are applied. The order
	// matters: a complete tag ('<hr>') must be handled before the matching
	// open-ended prefix ('<hr'), otherwise its closing '>' stays escaped.
	$allowed = array(
		'</a>',
		'<strike>', '</strike>',
		'<sub>', '</sub>',
		'<span', '</span>',
		'<font', '</font>',
		'<hr>', '<hr',
		'<br>', '<br />',
		'<tbody>', '</tbody>',
		'<tr>', '</tr>',
		'<td>', '</td>', '<td',
		'<table>', '<table', '</table>',
		'<div>', '<div', '</div>',
		'<u>', '<u ', '</u>',
		'<p>', '</p>', '<p ',
		'<li>', '</li>',
		'<ul>', '</ul>',
		'<ol>', '</ol>',
		'<blockquote>', '<blockquote', '</blockquote>',
		'<em>', '</em>',
		'<strong>', '</strong>',
		'<sup>', '</sup>',
		'<input ',
		'<img ',
		'<textarea ', '</textarea>',
		'<a ',
		'<h1>', '</h1>',
		'<h2>', '</h2>',
		'<h3>', '</h3>',
		'<h4>', '</h4>',
		'<h5>', '</h5>',
		'<h6>', '</h6>',
		'<pre>', '</pre>',
		'<address>', '</address>',
		'<h1 ', '<h2 ', '<h3 ', '<h4 ', '<h5 ', '<h6 ',
		// Allow embedding video like youtube ones.
		'<object ', '</object>',
		'<param ', '</param>',
		'<embed ', '</embed>',
	);

	foreach ($allowed as $fragment) {
		// Escaped spelling of the fragment as it appears in $input now.
		$escaped = str_replace(array('<', '>'), array('&lt;', '&gt;'), $fragment);
		// Every '/&gt;' has already been turned back into '/>' above, so a
		// self-closing fragment ('<br />') has to be searched in that form.
		$escaped = str_replace('/&gt;', '/>', $escaped);
		$input = str_replace($escaped, $fragment, $input);
	}

	// Typographic characters replaced by ASCII stand-ins.
	// NOTE(review): the search strings may have been meant as the
	// &rsquo; / &bull; entities rather than the literal characters - confirm.
	$input = str_replace('’', '\\\'', $input);
	$input = str_replace('•', '-', $input);

	return $input;
}
/**
 * stripTags - Remove all HTML from a text except the allowed elements, using HTML Purifier.
 *
 * When the APC functions are available the result is cached for one hour.
 * The cache key covers both the text and the list of allowed elements, so
 * the same text purified with a different list does not return a stale
 * entry.
 *
 * @param	string	$text		The HTML input
 * @param	string	$allowed	Value given to HTML Purifier's HTML.Allowed setting (default: 'br,p,li,ul')
 * @return	string	The purified text
 */
function stripTags($text, $allowed = 'br,p,li,ul') {
	$key = null;

	// Try apc caching first (if possible).
	if (function_exists('apc_fetch')) {
		$key = 'stripTags.'.md5($allowed."\0".$text);
		$found = false;
		$cached = apc_fetch($key, $found);
		// Rely on the success flag: an empty string is a valid cached result.
		if ($found) {
			return $cached;
		}
	}

	$config = HTMLPurifier_Config::createDefault();
	$config->set('Cache.DefinitionImpl', null);
	$config->set('HTML.Allowed', $allowed);
	$purifier = new HTMLPurifier($config);
	$text = $purifier->purify($text);

	if ($key !== null && function_exists('apc_store')) {
		apc_store($key, $text, 3600);
	}

	return $text;
}
/**
 * purify - Clean up a piece of HTML and run it through HTML Purifier with its default configuration.
 *
 * Before purifying, conditional markers such as "<![if !supportLists]>"
 * are removed, as well as every closing tag found at the very beginning
 * of the text and every non-closing tag found at the very end.
 *
 * @param	string	$text	The HTML input
 * @return	string	The purified HTML
 */
static function purify($text) {
	// Remove string like "<![if !supportLists]>" or "<![endif]>"
	$text = preg_replace('/<!\[.+?\]>/', '', $text);

	// Remove non opened tags at the beginning. The pattern is anchored, so
	// each pass removes at most one tag: repeat until nothing is removed.
	do {
		$text = preg_replace('/^(<\/[^>]+>)/', '', $text, -1, $count);
	} while ($count > 0);

	// Remove non closed tags at the end (same one-tag-per-pass logic).
	do {
		$text = preg_replace('/(<[^\/][^>]+>)$/', '', $text, -1, $count);
	} while ($count > 0);

	$config = HTMLPurifier_Config::createDefault();
	//$config->set('HTML.Allowed','a[href|title],strike,sub,span,font,hr,br,tbody,tr,td,table,div,u,p,ul,li,ol,blockquote,em,strong,sup,input,img,textarea,h1,h2,h3,h4,h5,h6,pre,address');
	$config->set('Cache.DefinitionImpl', null);
	$purifier = new HTMLPurifier($config);
	return $purifier->purify($text);
}
/**
 * summarize - Build a short HTML summary out of the first lines of a text.
 *
 * NOTE(review): the statements visible here stop inside the ternary passed
 * to util_make_links(); the rest of the method (including whatever it
 * returns) is not visible, so it is not described here.
 *
 * @param	string	$text		The text to summarize
 * @param	int	$nb_line	Maximum number of lines kept
 * @param	bool	$truncate	Whether lines longer than $nb_char are shortened
 * @param	int	$nb_char	Length limit (as measured by strlen) applied to a line when $truncate is set
 */
264 function summarize($text, $nb_line = 4, $truncate = true, $nb_char = 145) {
// Reduce the markup to stripTags()'s default allowed tags, then turn those
// tags into newlines / " - " bullets.
265 $text = $this->stripTags($text);
266 $text = $this->convertNeededTagsForEmail($text);
267 // Remove MS Windows extra char for CR
268 $text = preg_replace('/\r/', '', $text);
// Collapse a newline plus any whitespace/newlines following it into a single
// newline (drops empty lines and the leading blanks of the next line).
270 $text = preg_replace('/\n[\n\s]*/', "\n", $text);
// Work line by line, looking at no more than $nb_line lines.
272 $arr = explode("\n", $text);
273 $nb_max = count($arr);
274 if ($nb_max > $nb_line) $nb_max = $nb_line;
276 for ($l = 0; $l < $nb_max; $l++) {
// Every kept line is preceded by an HTML line break.
// NOTE(review): no initialisation of $summary is visible before this append - confirm.
277 $summary .= '<br />';
// When the text has fewer lines than $nb_line, the last line gets the
// character budget of the unused lines as well.
278 if ($truncate == true && $nb_max < $nb_line && $l == $nb_max - 1) {
279 $nb_char = $nb_char * ($nb_line - $nb_max + 1);
// A line that is too long is cut at $nb_char (substr, i.e. byte based) and the
// trailing run of non-whitespace left by the cut is replaced by ' <b>...</b>'.
281 $summary .= util_make_links((($truncate == true && strlen($arr[$l]) > $nb_char)?
282 preg_replace('/[^\s]*$/', ' <b>...</b>', substr($arr[$l], 0, $nb_char), 1) :
292 // c-file-style: "bsd"