1 package net.sf.xsshtmlfilter;
2
3 import java.util.ArrayList;
4 import java.util.Collections;
5 import java.util.HashMap;
6 import java.util.List;
7 import java.util.Map;
8 import java.util.concurrent.ConcurrentHashMap;
9 import java.util.concurrent.ConcurrentMap;
10 import java.util.logging.Logger;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/**
 * Whitelist-based HTML filter for untrusted markup.
 *
 * <p>{@link #filter(String)} runs the input through a fixed pipeline: comment
 * bodies are escaped, stray angle brackets are balanced, every tag is checked
 * against the configured whitelist (unknown tags and attributes are dropped,
 * URL-bearing attributes have their protocol checked), empty formatting tags
 * are removed, and finally entities and quotes in the text are normalised.
 *
 * <p>An instance keeps per-call state (the open-tag counters), so a single
 * instance must not be used by several threads at the same time.
 */
public final class HTMLFilter {

    /** Case-insensitive + dot-matches-newline, shared by the tag level patterns. */
    private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
    private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
    private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
    private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
    private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
    private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
    private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
    private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
    private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
    private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
    // Hex references and percent escapes are case-insensitive ("&#X3A;", "%3A").
    private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?", Pattern.CASE_INSENSITIVE);
    private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?", Pattern.CASE_INSENSITIVE);
    private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
    private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
    private static final Pattern P_END_ARROW = Pattern.compile("^>");
    private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_AMP = Pattern.compile("&");
    private static final Pattern P_QUOTE = Pattern.compile("\"");
    private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
    private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
    private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");

    /** Per-tag cache of the "empty open/close pair" patterns, shared by all instances. */
    private static final ConcurrentMap<String, Pattern> P_REMOVE_PAIR_BLANKS = new ConcurrentHashMap<String, Pattern>();
    /** Per-tag cache of the "empty self-closed tag" patterns, shared by all instances. */
    private static final ConcurrentMap<String, Pattern> P_REMOVE_SELF_BLANKS = new ConcurrentHashMap<String, Pattern>();

    /** One logger for all debug output instead of a fresh anonymous logger per message. */
    private static final Logger LOGGER = Logger.getAnonymousLogger();

    /** Allowed tag name -> allowed attribute names. An empty map allows every tag and attribute. */
    private final Map<String, List<String>> vAllowed;

    /** Number of currently unclosed occurrences per tag; cleared at the start of every filter() call. */
    private final Map<String, Integer> vTagCounts = new HashMap<String, Integer>();

    /** Tags that are always written in self-closed form and never counted as open. */
    private final String[] vSelfClosingTags;

    /** Tags that are always written as an opening tag and closed by the filter if left open. */
    private final String[] vNeedClosingTags;

    /** Tags rejected even when the whitelist would accept them. */
    private final String[] vDisallowed;

    /** Attributes whose value is checked against {@link #vAllowedProtocols}. */
    private final String[] vProtocolAtts;

    /** Protocols accepted in protocol-checked attributes. */
    private final String[] vAllowedProtocols;

    /** Tags that are removed when they have no content. */
    private final String[] vRemoveBlanks;

    /** Named entities that are passed through unescaped. */
    private final String[] vAllowedEntities;

    /** When true, comments are dropped; when false, (escaped) comments are kept. */
    private final boolean stripComment;
    /** When true, double quotes in text outside tags are written as {@code &quot;}. */
    private final boolean encodeQuotes;
    private boolean vDebug = false;

    /**
     * When true, unbalanced angle brackets are completed into tags; when false
     * they are escaped as text instead.
     */
    private final boolean alwaysMakeTags;

    /**
     * Creates a filter with the built-in configuration: {@code a} (href, target),
     * {@code img} (src, width, height, alt) and attribute-less {@code b},
     * {@code strong}, {@code i}, {@code em}; protocols {@code http} and
     * {@code mailto}; entities {@code amp}, {@code gt}, {@code lt}, {@code quot}.
     */
    public HTMLFilter() {
        vAllowed = new HashMap<String, List<String>>();

        final List<String> anchorAtts = new ArrayList<String>();
        anchorAtts.add("href");
        anchorAtts.add("target");
        vAllowed.put("a", anchorAtts);

        final List<String> imageAtts = new ArrayList<String>();
        imageAtts.add("src");
        imageAtts.add("width");
        imageAtts.add("height");
        imageAtts.add("alt");
        vAllowed.put("img", imageAtts);

        final List<String> noAtts = new ArrayList<String>();
        vAllowed.put("b", noAtts);
        vAllowed.put("strong", noAtts);
        vAllowed.put("i", noAtts);
        vAllowed.put("em", noAtts);

        vSelfClosingTags = new String[]{"img"};
        vNeedClosingTags = new String[]{"a", "b", "strong", "i", "em"};
        vDisallowed = new String[]{};
        vAllowedProtocols = new String[]{"http", "mailto"};
        vProtocolAtts = new String[]{"src", "href"};
        vRemoveBlanks = new String[]{"a", "b", "strong", "i", "em"};
        vAllowedEntities = new String[]{"amp", "gt", "lt", "quot"};
        stripComment = true;
        encodeQuotes = true;
        alwaysMakeTags = true;
    }

    /**
     * Creates a filter with the built-in configuration and the given debug setting.
     *
     * @param debug when true, every pipeline stage is logged at INFO level
     */
    public HTMLFilter(final boolean debug) {
        this();
        vDebug = debug;
    }

    /**
     * Creates a filter from an explicit configuration map.
     *
     * <p>Required keys: {@code vAllowed} (a {@code Map<String, List<String>>}) and
     * the {@code String[]} valued {@code vSelfClosingTags}, {@code vNeedClosingTags},
     * {@code vDisallowed}, {@code vAllowedProtocols}, {@code vProtocolAtts},
     * {@code vRemoveBlanks}, {@code vAllowedEntities}. Optional {@code Boolean} keys
     * {@code stripComment}, {@code encodeQuotes} and {@code alwaysMakeTags} default
     * to {@code true}.
     *
     * @param conf the configuration; arrays are copied, the whitelist map is wrapped read-only
     * @throws IllegalArgumentException if {@code conf} is null or a required key is missing or null
     * @throws ClassCastException if a value does not have the documented type
     */
    public HTMLFilter(final Map<String, Object> conf) {
        if (conf == null) {
            throw new IllegalArgumentException("configuration must not be null");
        }

        // Explicit checks instead of assert: assertions are disabled by default,
        // which would let a broken configuration fail much later inside filter().
        @SuppressWarnings("unchecked") // documented contract of the "vAllowed" entry
        final Map<String, List<String>> allowedTags = (Map<String, List<String>>) requireEntry(conf, "vAllowed");
        vAllowed = Collections.unmodifiableMap(allowedTags);
        vSelfClosingTags = requireStringArray(conf, "vSelfClosingTags");
        vNeedClosingTags = requireStringArray(conf, "vNeedClosingTags");
        vDisallowed = requireStringArray(conf, "vDisallowed");
        vAllowedProtocols = requireStringArray(conf, "vAllowedProtocols");
        vProtocolAtts = requireStringArray(conf, "vProtocolAtts");
        vRemoveBlanks = requireStringArray(conf, "vRemoveBlanks");
        vAllowedEntities = requireStringArray(conf, "vAllowedEntities");
        stripComment = optionalFlag(conf, "stripComment", true);
        encodeQuotes = optionalFlag(conf, "encodeQuotes", true);
        alwaysMakeTags = optionalFlag(conf, "alwaysMakeTags", true);
    }

    /** Returns the non-null value stored under {@code key} or rejects the configuration. */
    private static Object requireEntry(final Map<String, Object> conf, final String key) {
        final Object value = conf.get(key);
        if (value == null) {
            throw new IllegalArgumentException("configuration requires " + key);
        }
        return value;
    }

    /** Returns a private copy of the required {@code String[]} stored under {@code key}. */
    private static String[] requireStringArray(final Map<String, Object> conf, final String key) {
        return ((String[]) requireEntry(conf, key)).clone();
    }

    /** Returns the Boolean stored under {@code key}, or {@code fallback} when absent or null. */
    private static boolean optionalFlag(final Map<String, Object> conf, final String key, final boolean fallback) {
        final Object value = conf.get(key);
        return value == null ? fallback : ((Boolean) value).booleanValue();
    }

    /** Lower-cases markup names independently of the default locale (e.g. Turkish dotless i). */
    private static String toLowerAscii(final String s) {
        return s.toLowerCase(java.util.Locale.ENGLISH);
    }

    private void reset() {
        vTagCounts.clear();
    }

    private void debug(final String msg) {
        if (vDebug) {
            LOGGER.info(msg);
        }
    }

    /**
     * Converts a character code to a string.
     *
     * @param decimal the character code
     * @return the character as a string; supplementary code points yield a surrogate
     *         pair, any other value is narrowed to a single {@code char}
     */
    public static String chr(final int decimal) {
        if (Character.isSupplementaryCodePoint(decimal)) {
            return new String(Character.toChars(decimal));
        }
        return String.valueOf((char) decimal);
    }

    /**
     * Escapes the four HTML special characters {@code & " < >}.
     *
     * @param s the text to escape
     * @return {@code s} with each special character replaced by its named entity
     */
    public static String htmlSpecialChars(final String s) {
        String result = s;
        // The ampersand must be handled first so the entities added below are not re-escaped.
        result = regexReplace(P_AMP, "&amp;", result);
        result = regexReplace(P_QUOTE, "&quot;", result);
        result = regexReplace(P_LEFT_ARROW, "&lt;", result);
        result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
        return result;
    }

    /**
     * Filters the given markup according to this instance's configuration.
     *
     * @param input the untrusted markup, may be null
     * @return the filtered markup, or null when {@code input} is null
     */
    public String filter(final String input) {
        if (input == null) {
            return null;
        }
        reset();
        String s = input;

        debug("************************************************");
        debug(" INPUT: " + input);

        s = escapeComments(s);
        debug(" escapeComments: " + s);

        s = balanceHTML(s);
        debug(" balanceHTML: " + s);

        s = checkTags(s);
        debug(" checkTags: " + s);

        s = processRemoveBlanks(s);
        debug("processRemoveBlanks: " + s);

        s = validateEntities(s);
        debug(" validateEntites: " + s);

        debug("************************************************\n\n");
        return s;
    }

    /** @return whether unbalanced angle brackets are completed into tags */
    public boolean isAlwaysMakeTags() {
        return alwaysMakeTags;
    }

    /** @return whether comments are removed from the output */
    public boolean isStripComments() {
        return stripComment;
    }

    /**
     * Escapes the body of every comment so markup inside a comment can no longer
     * be picked up as a tag by the later stages.
     */
    private String escapeComments(final String s) {
        final Matcher m = P_COMMENTS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        // Loop over all comments; handling only the first one would leave the
        // content of later comments unescaped.
        while (m.find()) {
            final String body = m.group(1);
            m.appendReplacement(buf, Matcher.quoteReplacement("<!--" + htmlSpecialChars(body) + "-->"));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Repairs unbalanced angle brackets: either completes them into tags
     * ({@code alwaysMakeTags}) or escapes the stray ones as text.
     */
    private String balanceHTML(String s) {
        if (alwaysMakeTags) {
            // Try to form tags out of whatever brackets are there.
            s = regexReplace(P_END_ARROW, "", s);
            s = regexReplace(P_BODY_TO_END, "<$1>", s);
            s = regexReplace(P_XML_CONTENT, "$1<$2", s);
        } else {
            // Escape stray brackets.
            s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
            s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);

            // The previous replacement appends a '<' in front of the '>' it only
            // looked ahead at, which leaves "<>" pairs behind; drop them.
            s = regexReplace(P_BOTH_ARROWS, "", s);
        }
        return s;
    }

    /**
     * Rewrites every tag through {@link #processTag(String)} and appends closing
     * tags for everything that is still open afterwards.
     */
    private String checkTags(final String s) {
        final Matcher m = P_TAGS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(buf, Matcher.quoteReplacement(processTag(m.group(1))));
        }
        m.appendTail(buf);

        // Close whatever is still open. The counters carry no nesting
        // information, so the tags are closed in map iteration order.
        for (Map.Entry<String, Integer> open : vTagCounts.entrySet()) {
            for (int ii = 0; ii < open.getValue().intValue(); ii++) {
                buf.append("</").append(open.getKey()).append(">");
            }
        }
        return buf.toString();
    }

    /** Removes configured tags that have no content, in both pair and self-closed form. */
    private String processRemoveBlanks(final String s) {
        String result = s;
        for (String tag : vRemoveBlanks) {
            result = regexReplace(blankPattern(P_REMOVE_PAIR_BLANKS, tag, "></" + tag + ">"), "", result);
            result = regexReplace(blankPattern(P_REMOVE_SELF_BLANKS, tag, "/>"), "", result);
        }
        return result;
    }

    /**
     * Returns the cached pattern for {@code tag}, compiling it on first use.
     * The tag is quoted so it is always matched literally.
     */
    private static Pattern blankPattern(final ConcurrentMap<String, Pattern> cache, final String tag,
                                        final String closing) {
        Pattern pattern = cache.get(tag);
        if (pattern == null) {
            final Pattern compiled = Pattern.compile(
                    "<" + Pattern.quote(tag) + "(\\s[^>]*)?" + Pattern.quote(closing));
            final Pattern raced = cache.putIfAbsent(tag, compiled);
            pattern = raced != null ? raced : compiled;
        }
        return pattern;
    }

    private static String regexReplace(final Pattern regex_pattern, final String replacement, final String s) {
        return regex_pattern.matcher(s).replaceAll(replacement);
    }

    /**
     * Validates one tag.
     *
     * @param s the text between {@code <} and {@code >}
     * @return the sanitised tag including its angle brackets, or an empty string
     *         when the tag is dropped
     */
    private String processTag(final String s) {
        // Closing tags: only emitted when a matching tag is actually open, so
        // surplus closing tags are dropped and counters never go negative.
        Matcher m = P_END_TAG.matcher(s);
        if (m.find()) {
            final String name = toLowerAscii(m.group(1));
            if (allowed(name) && !inArray(name, vSelfClosingTags)) {
                final Integer open = vTagCounts.get(name);
                if (open != null && open.intValue() > 0) {
                    vTagCounts.put(name, Integer.valueOf(open.intValue() - 1));
                    return "</" + name + ">";
                }
            }
        }

        // Opening tags.
        m = P_START_TAG.matcher(s);
        if (m.find()) {
            final String name = toLowerAscii(m.group(1));
            final String body = m.group(2);
            String ending = m.group(3);

            if (!allowed(name)) {
                return "";
            }

            final String params = buildAttributes(name, body);

            if (inArray(name, vSelfClosingTags)) {
                ending = " /";
            }
            if (inArray(name, vNeedClosingTags)) {
                ending = "";
            }

            if (ending == null || ending.length() < 1) {
                final Integer open = vTagCounts.get(name);
                vTagCounts.put(name, Integer.valueOf(open == null ? 1 : open.intValue() + 1));
            } else {
                ending = " /";
            }
            return "<" + name + params + ending + ">";
        }

        // Comments.
        m = P_COMMENT.matcher(s);
        if (!stripComment && m.find()) {
            return "<" + m.group() + ">";
        }

        return "";
    }

    /**
     * Builds the sanitised attribute string for an allowed tag: quoted attributes
     * first, then unquoted ones, keeping only whitelisted names.
     */
    private String buildAttributes(final String tagName, final String body) {
        final List<String> paramNames = new ArrayList<String>();
        final List<String> paramValues = new ArrayList<String>();

        final Matcher quoted = P_QUOTED_ATTRIBUTES.matcher(body);
        while (quoted.find()) {
            paramNames.add(quoted.group(1));
            paramValues.add(quoted.group(3));
        }
        final Matcher unquoted = P_UNQUOTED_ATTRIBUTES.matcher(body);
        while (unquoted.find()) {
            paramNames.add(unquoted.group(1));
            paramValues.add(unquoted.group(3));
        }

        final StringBuilder params = new StringBuilder();
        for (int ii = 0; ii < paramNames.size(); ii++) {
            final String paramName = toLowerAscii(paramNames.get(ii));
            String paramValue = paramValues.get(ii);

            if (allowedAttribute(tagName, paramName)) {
                if (inArray(paramName, vProtocolAtts)) {
                    paramValue = processParamProtocol(paramValue);
                }
                params.append(' ').append(paramName)
                      .append("=\"").append(escapeAttributeValue(paramValue)).append('"');
            }
        }
        return params.toString();
    }

    /**
     * Makes a value safe inside a double-quoted attribute. A single-quoted input
     * value may contain {@code "}, and decoded entities may contain {@code <} or
     * {@code >}; left raw they would end the attribute or confuse the later
     * text-level passes.
     */
    private static String escapeAttributeValue(final String value) {
        return value.replace("\"", "&quot;").replace("<", "&lt;").replace(">", "&gt;");
    }

    /**
     * Decodes the value and, when it starts with a protocol that is not
     * whitelisted, replaces that protocol with {@code #}.
     */
    private String processParamProtocol(String s) {
        s = decodeEntities(s);
        final Matcher m = P_PROTOCOL.matcher(s);
        if (m.find()) {
            final String protocol = m.group(1);
            if (!inArray(protocol, vAllowedProtocols)) {
                // Bad protocol: turn the value into a local anchor.
                s = "#" + s.substring(protocol.length() + 1, s.length());
                if (s.startsWith("#//")) {
                    s = "#" + s.substring(3, s.length());
                }
            }
        }
        return s;
    }

    /**
     * Decodes decimal references, hexadecimal references and percent escapes so
     * an obfuscated protocol becomes visible, then re-validates the entities.
     */
    private String decodeEntities(String s) {
        s = decodeNumeric(P_ENTITY, 10, s);
        s = decodeNumeric(P_ENTITY_UNICODE, 16, s);
        s = decodeNumeric(P_ENCODE, 16, s);
        return validateEntities(s);
    }

    /**
     * Replaces every match of {@code pattern} by the character whose code is
     * given by group 1 in the given radix. The radix is explicit so leading
     * zeros are never read as octal; references that overflow or are not valid
     * code points are left untouched instead of aborting the whole filter run.
     */
    private static String decodeNumeric(final Pattern pattern, final int radix, final String s) {
        final Matcher m = pattern.matcher(s);
        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            String decoded = m.group();
            try {
                final int codePoint = Integer.parseInt(m.group(1), radix);
                if (Character.isValidCodePoint(codePoint)) {
                    decoded = chr(codePoint);
                }
            } catch (NumberFormatException ignored) {
                // Number too large for an int: keep the reference as written.
            }
            m.appendReplacement(buf, Matcher.quoteReplacement(decoded));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /** Escapes every ampersand that does not start a whitelisted entity, then encodes quotes. */
    private String validateEntities(final String s) {
        final StringBuffer buf = new StringBuffer();
        final Matcher m = P_VALID_ENTITIES.matcher(s);
        while (m.find()) {
            final String preamble = m.group(1);
            final String terminator = m.group(2);
            m.appendReplacement(buf, Matcher.quoteReplacement(checkEntity(preamble, terminator)));
        }
        m.appendTail(buf);
        return encodeQuotes(buf.toString());
    }

    /** Writes double quotes in text outside of tags as {@code &quot;} when enabled. */
    private String encodeQuotes(final String s) {
        if (!encodeQuotes) {
            return s;
        }
        final StringBuffer buf = new StringBuffer();
        final Matcher m = P_VALID_QUOTES.matcher(s);
        while (m.find()) {
            final String before = m.group(1);
            final String text = m.group(2);
            final String after = m.group(3);
            m.appendReplacement(buf,
                    Matcher.quoteReplacement(before + regexReplace(P_QUOTE, "&quot;", text) + after));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Returns the replacement for {@code &preamble}: unchanged when it is a
     * terminated, whitelisted entity, otherwise with the ampersand escaped.
     */
    private String checkEntity(final String preamble, final String term) {
        return ";".equals(term) && isValidEntity(preamble)
                ? '&' + preamble
                : "&amp;" + preamble;
    }

    private boolean isValidEntity(final String entity) {
        return inArray(entity, vAllowedEntities);
    }

    private static boolean inArray(final String s, final String[] array) {
        for (String item : array) {
            if (item != null && item.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /** A tag is allowed when the whitelist is empty or contains it, and it is not disallowed. */
    private boolean allowed(final String name) {
        return (vAllowed.isEmpty() || vAllowed.containsKey(name)) && !inArray(name, vDisallowed);
    }

    /** An attribute is allowed on an allowed tag when the whitelist is empty or lists it for that tag. */
    private boolean allowedAttribute(final String name, final String paramName) {
        if (!allowed(name)) {
            return false;
        }
        if (vAllowed.isEmpty()) {
            return true;
        }
        final List<String> attributes = vAllowed.get(name);
        return attributes != null && attributes.contains(paramName);
    }
}