3 * FusionForge text sanitisation
5 * Copyright (C) 2005, Daniel Perez
6 * Copyright (C) 2008-2009 Alcatel-Lucent
8 * This file is part of FusionForge. FusionForge is free software;
9 * you can redistribute it and/or modify it under the terms of the
10 * GNU General Public License as published by the Free Software
11 * Foundation; either version 2 of the Licence, or (at your option)
14 * FusionForge is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with FusionForge; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * Standard Alcatel-Lucent disclaimer for contributing to open source
27 * "The Style Sheet ("Contribution") has not been tested and/or
28 * validated for release as or in products, combinations with products or
29 * other commercial use. Any use of the Contribution is entirely made at
30 * the user's own responsibility and the user can not rely on any features,
31 * functionalities or performances Alcatel-Lucent has attributed to the
34 * THE CONTRIBUTION BY ALCATEL-LUCENT IS PROVIDED AS IS, WITHOUT WARRANTY
35 * OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
36 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, COMPLIANCE,
37 * NON-INTERFERENCE AND/OR INTERWORKING WITH THE SOFTWARE TO WHICH THE
38 * CONTRIBUTION HAS BEEN MADE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL
39 * ALCATEL-LUCENT BE LIABLE FOR ANY DAMAGES OR OTHER LIABLITY, WHETHER IN
40 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
41 * CONTRIBUTION OR THE USE OR OTHER DEALINGS IN THE CONTRIBUTION, WHETHER
42 * TOGETHER WITH THE SOFTWARE TO WHICH THE CONTRIBUTION RELATES OR ON A STAND
46 require_once('HTMLPurifier.auto.php');
48 Class TextSanitizer extends Error {
/**
 * convertExtendedCharsForEmail - Replace a fixed set of named HTML entities
 * (and <br> tags) by the plain characters they stand for, so that the text
 * reads naturally in a plain-text e-mail.
 *
 * The pairs are applied one after another in table order. Because "&amp;"
 * is handled before most other entities, a doubly escaped entity such as
 * "&amp;lt;" ends up as "<".
 *
 * @param	string	$text	The input string
 * @return	string	The output string
 */
function convertExtendedCharsForEmail($text) {
	// Ordered entity => character table. str_replace() with array arguments
	// processes the pairs sequentially, exactly like a chain of single calls.
	// NOTE(review): the entity names were reconstructed from the replacement
	// characters -- confirm against the upstream file.
	$map = array(
		'&acute;'  => "'", // it's better to see that char in the email than the html entity
		'&amp;'    => '&',
		'&quot;'   => '"',
		'&rsquo;'  => '’',
		'&nbsp;'   => ' ',
		'&lt;'     => '<',
		'&gt;'     => '>',
		'&deg;'    => '°',
		'&sup2;'   => '²',
		'&euro;'   => '€',
		'&uml;'    => '¨',
		'&pound;'  => '£',
		'&curren;' => '¤',
		'&micro;'  => 'µ',
		'&sect;'   => '§',
		'&oelig;'  => 'œ',
		// Runs after &lt;/&gt; so that escaped line breaks are converted too.
		'<br>'     => "\n",
		'<br />'   => "\n",
		'&eacute;' => 'é',
		'&egrave;' => 'è',
		'&ecirc;'  => 'ê',
		'&euml;'   => 'ë',
		'&agrave;' => 'à',
		'&acirc;'  => 'â',
		'&ccedil;' => 'ç',
		'&ugrave;' => 'ù',
		'&ucirc;'  => 'û',
		'&uuml;'   => 'ü',
		'&ocirc;'  => 'ô',
		'&iuml;'   => 'ï',
	);

	return str_replace(array_keys($map), array_values($map), $text);
}
/**
 * convertNeededTagsForEmail - Turn the few HTML tags that matter for layout
 * (<br>, <p>, <li>, <ul>) into their plain-text equivalent.
 *
 * Line breaks and paragraph tags become a newline, each list item starts on
 * its own line prefixed by " - ", and a UTF-8 encoded non-breaking space is
 * turned into a normal space. Any other tag is left untouched.
 *
 * @param	string	$text	The input string
 * @return	string	The output string
 */
function convertNeededTagsForEmail($text) {
	// Ordered tag => text table, applied pair by pair by str_replace().
	$map = array(
		'<br>'     => "\n",
		'<br />'   => "\n",
		'<br/>'    => "\n",
		'<p>'      => "\n",
		'</p>'     => "\n",
		'<li>'     => "\n - ",
		'</li>'    => '',
		'<ul>'     => '',
		'</ul>'    => "\n",
		"\xc2\xa0" => ' ', // UTF-8 byte sequence of U+00A0 (non-breaking space)
	);

	return str_replace(array_keys($map), array_values($map), $text);
}
/**
 * unhtmlentities - Undo the escaping done by htmlspecialchars().
 *
 * The HTML_SPECIALCHARS translation table is flipped (entity => character)
 * and applied in a single strtr() pass. A second step then converts any
 * "&quot;" that is still present afterwards -- which happens when the input
 * held a doubly escaped "&amp;quot;" -- into a double quote.
 *
 * @param	string	$string	The escaped input string
 * @return	string	The decoded string
 */
function unhtmlentities ($string) {
	$decode_tbl = array_flip(get_html_translation_table(HTML_SPECIALCHARS));
	$res = strtr($string, $decode_tbl);

	return str_replace("&quot;", '"', $res);
}
/**
 * SanitizeHtml - Escape a piece of HTML completely, then re-enable a fixed
 * whitelist of tags.
 *
 * Steps, in order:
 *  1. htmlspecialchars() escapes the whole input.
 *  2. "&amp;" and "&quot;" are turned back into "&" and '"'.
 *  3. Every whitelisted tag fragment is turned back from its escaped form
 *     ("&lt;strong&gt;") into markup ("<strong>"). Fragments ending in ">"
 *     match the bare tag; fragments without it ("<span", "<a ") match the tag
 *     start so that attributes can follow.
 *  4. The generic tag endings '/&gt;' and '"&gt;' are re-enabled. This runs
 *     after step 3 so that the whitelisted '<br />' can still be matched as
 *     a whole.
 *  5. Two typographic characters are replaced by ASCII substitutes.
 *
 * SECURITY NOTE(review): the whitelist re-enables tag starts such as "<a ",
 * "<img ", "<input ", "<object " and "<embed " without validating the
 * attributes that follow, so event handlers or javascript: URLs can pass
 * through. The whitelist is kept as-is for compatibility; consider using
 * purify() for untrusted input.
 *
 * @param	string	$input	The HTML Code
 * @return	string	The HTML output
 */
function SanitizeHtml($input) {
	$input = htmlspecialchars($input); // first strip all chars

	// Undo the escaping of ampersands and double quotes.
	$input = str_replace(array('&amp;', '&quot;'), array('&', '"'), $input);

	// Whitelisted markup, in the order in which it is re-enabled.
	$allowed = array(
		'</a>',
		'<strike>', '</strike>',
		'<sub>', '</sub>',
		'<span', '</span>',
		'<font', '</font>',
		'<hr>', '<hr',
		'<br>', '<br />',
		'<tbody>', '</tbody>',
		'<tr>', '</tr>',
		'<td>', '</td>', '<td',
		'<table>', '<table', '</table>',
		'<div>', '<div', '</div>',
		'<u>', '<u ', '</u>',
		'<p>', '</p>', '<p ',
		'<li>', '</li>',
		'<ul>', '</ul>',
		'<ol>', '</ol>',
		'<blockquote>', '<blockquote', '</blockquote>',
		'<em>', '</em>',
		'<strong>', '</strong>',
		'<sup>', '</sup>',
		'<input ',
		'<img ',
		'<textarea ', '</textarea>',
		'<a ',
		'<h1>', '</h1>',
		'<h2>', '</h2>',
		'<h3>', '</h3>',
		'<h4>', '</h4>',
		'<h5>', '</h5>',
		'<h6>', '</h6>',
		'<pre>', '</pre>',
		'<address>', '</address>',
		'<h1 ', '<h2 ', '<h3 ', '<h4 ', '<h5 ', '<h6 ',
		// Allow embedding video like youtube ones.
		'<object ', '</object>',
		'<param ', '</param>',
		'<embed ', '</embed>',
	);

	// After htmlspecialchars() only the escaped spelling of each fragment can
	// occur in the text, so that is what has to be searched for.
	$brackets = array('<' => '&lt;', '>' => '&gt;');
	$escaped = array();
	foreach ($allowed as $fragment) {
		$escaped[] = strtr($fragment, $brackets);
	}
	$input = str_replace($escaped, $allowed, $input);

	// Generic endings of self-closing tags and of tags whose last attribute
	// is quoted (the quotes were already restored above).
	$input = str_replace(array('/&gt;', '"&gt;'), array('/>', '">'), $input);

	// NOTE(review): the typographic apostrophe is replaced by a backslash
	// followed by an ASCII apostrophe, the bullet by a dash. Whether the
	// search strings were meant to be the characters or their entities could
	// not be determined here -- confirm against the upstream file.
	$input = str_replace('’', '\\\'', $input);
	$input = str_replace('•', '-', $input);

	return $input;
}
/**
 * stripTags - Remove all HTML elements except an allowed list, using
 * HTML Purifier.
 *
 * @param	string	$text		The HTML to clean
 * @param	string	$allowed	Value handed to HTML Purifier's 'HTML.Allowed'
 *					directive; by default only br, p, li and ul
 *					are kept
 * @return	string	The purified text
 */
function stripTags ($text, $allowed='br,p,li,ul') {
	$config = HTMLPurifier_Config::createDefault();
	// NOTE(review): a NULL 'Cache.DefinitionImpl' presumably turns off
	// HTML Purifier's definition cache -- confirm in the library docs.
	$config->set('Cache.DefinitionImpl', NULL);
	$config->set('HTML.Allowed', $allowed);
	$purifier = new HTMLPurifier($config);

	return $purifier->purify($text);
}
/**
 * purify - Clean a piece of HTML with HTML Purifier's default configuration.
 *
 * Conditional markers of the form "<![ ... ]>" are stripped before the text
 * is handed to HTML Purifier. An explicit 'HTML.Allowed' whitelist used to
 * be configured here but is disabled, so the library defaults apply.
 *
 * @param	string	$text	The HTML to clean
 * @return	string	The purified HTML
 */
static function purify ($text) {
	// Remove string like "<![if !supportLists]>" or "<![endif]>"
	$cleaned = preg_replace('/<!\[.+?\]>/', '', $text);

	$settings = HTMLPurifier_Config::createDefault();
	$settings->set('Cache.DefinitionImpl', NULL);

	$engine = new HTMLPurifier($settings);
	return $engine->purify($cleaned);
}
243 function summarize ($text, $nb_line=4, $truncate=true, $nb_char=145) {
244 $text = $this->stripTags($text);
245 $text = $this->convertNeededTagsForEmail($text);
246 // Remove MS Windows extra char for CR
247 $text = preg_replace('/\r/', '', $text);
249 $text = preg_replace('/\n[\n\s]*/', "\n", $text);
251 $arr = explode("\n", $text);
252 $nb_max = count($arr);
253 if ($nb_max > $nb_line) $nb_max = $nb_line;
255 for ($l = 0; $l < $nb_max; $l++) {
256 $summary .= '<br />';
257 if ($truncate == true && $nb_max < $nb_line && $l == $nb_max - 1) {
258 $nb_char = $nb_char * ($nb_line - $nb_max + 1);
260 $summary .= util_make_links((($truncate == true && strlen($arr[$l]) > $nb_char) ?
261 preg_replace('/[^\s]*$/', ' <b>...</b>', substr($arr[$l], 0, $nb_char), 1) :
271 // c-file-style: "bsd"