View Javadoc

1   package net.sf.xsshtmlfilter;
2   
3   import java.util.ArrayList;
4   import java.util.Collections;
5   import java.util.HashMap;
6   import java.util.List;
7   import java.util.Map;
8   import java.util.concurrent.ConcurrentHashMap;
9   import java.util.concurrent.ConcurrentMap;
10  import java.util.logging.Logger;
11  import java.util.regex.Matcher;
12  import java.util.regex.Pattern;
13  
/**
 * HTML filtering utility for protecting against XSS (Cross Site Scripting).
 *
 * This code is licensed LGPLv3
 *
 * This code is a Java port of the original work in PHP by Cal Hendersen.
 * http://code.iamcal.com/php/lib_filter/
 *
 * The trickiest part of the translation was handling the differences in regex handling
 * between PHP and Java.  These resources were helpful in the process:
 *
 * http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
 * http://us2.php.net/manual/en/reference.pcre.pattern.modifiers.php
 * http://www.regular-expressions.info/modifiers.html
 *
 * A note on naming conventions: instance variables are prefixed with a "v"; global
 * constants are in all caps.
 *
 * Sample use:
 * <pre>
 * String input = ...
 * String clean = new HTMLFilter().filter( input );
 * </pre>
 *
 * The class is not thread safe: {@link #filter(String)} keeps per-call state (the open-tag
 * counters) in an instance field. Create a new instance if in doubt.
 *
 * If you find bugs or have suggestions on improvement (especially regarding
 * performance), please contact us.  The latest version of this
 * source, and our contact details, can be found at http://xss-html-filter.sf.net
 *
 * @author Joseph O'Connell
 * @author Cal Hendersen
 * @author Michael Semb Wever
 */
public final class HTMLFilter {

    /** regex flag union representing /si modifiers in php **/
    private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
    private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
    private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
    private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
    private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
    private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
    private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
    private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
    private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
    private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
    private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?");
    private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?");
    private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
    private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
    private static final Pattern P_END_ARROW = Pattern.compile("^>");
    private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_AMP = Pattern.compile("&");
    private static final Pattern P_QUOTE = Pattern.compile("\"");
    private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
    private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
    private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");

    // @xxx could grow large... maybe use sesat's ReferenceMap
    /** cache of "empty open/close pair" patterns, keyed by tag name (shared by all instances) **/
    private static final ConcurrentMap<String,Pattern> P_REMOVE_PAIR_BLANKS = new ConcurrentHashMap<String, Pattern>();
    /** cache of "empty self-closing tag" patterns, keyed by tag name (shared by all instances) **/
    private static final ConcurrentMap<String,Pattern> P_REMOVE_SELF_BLANKS = new ConcurrentHashMap<String, Pattern>();

    /** set of allowed html elements, along with allowed attributes for each element **/
    private final Map<String, List<String>> vAllowed;
    /** counts of open tags for each (allowable) html element **/
    private final Map<String, Integer> vTagCounts = new HashMap<String, Integer>();

    /** html elements which must always be self-closing (e.g. "<img />") **/
    private final String[] vSelfClosingTags;
    /** html elements which must always have separate opening and closing tags (e.g. "<b></b>") **/
    private final String[] vNeedClosingTags;
    /** set of disallowed html elements **/
    private final String[] vDisallowed;
    /** attributes which should be checked for valid protocols **/
    private final String[] vProtocolAtts;
    /** allowed protocols **/
    private final String[] vAllowedProtocols;
    /** tags which should be removed if they contain no content (e.g. "<b></b>" or "<b />") **/
    private final String[] vRemoveBlanks;
    /** entities allowed within html markup **/
    private final String[] vAllowedEntities;
    /** flag determining whether comments are stripped from the input String. */
    private final boolean stripComment;
    /** flag determining whether double quotes in text content are rewritten as "&amp;quot;". */
    private final boolean encodeQuotes;
    /** flag enabling logging of every intermediate filter stage. */
    private boolean vDebug = false;
    /**
     * flag determining whether to try to make tags when presented with "unbalanced"
     * angle brackets (e.g. "<b text </b>" becomes "<b> text </b>").  If set to false,
     * unbalanced angle brackets will be html escaped.
     */
    private final boolean alwaysMakeTags;

    /**
     * Default constructor.
     *
     * Allows {@code a} (href, target), {@code img} (src, width, height, alt) and the
     * attribute-less {@code b}, {@code strong}, {@code i}, {@code em}; allows the
     * {@code http} and {@code mailto} protocols; strips comments, encodes quotes and
     * always tries to make tags.
     */
    public HTMLFilter() {
        vAllowed = new HashMap<String, List<String>>();

        final List<String> a_atts = new ArrayList<String>();
        a_atts.add("href");
        a_atts.add("target");
        vAllowed.put("a", a_atts);

        final List<String> img_atts = new ArrayList<String>();
        img_atts.add("src");
        img_atts.add("width");
        img_atts.add("height");
        img_atts.add("alt");
        vAllowed.put("img", img_atts);

        final List<String> no_atts = new ArrayList<String>();
        vAllowed.put("b", no_atts);
        vAllowed.put("strong", no_atts);
        vAllowed.put("i", no_atts);
        vAllowed.put("em", no_atts);

        vSelfClosingTags = new String[]{"img"};
        vNeedClosingTags = new String[]{"a", "b", "strong", "i", "em"};
        vDisallowed = new String[]{};
        vAllowedProtocols = new String[]{"http", "mailto"}; // no ftp.
        vProtocolAtts = new String[]{"src", "href"};
        vRemoveBlanks = new String[]{"a", "b", "strong", "i", "em"};
        vAllowedEntities = new String[]{"amp", "gt", "lt", "quot"};
        stripComment = true;
        encodeQuotes = true;
        alwaysMakeTags = true;
    }

    /**
     * Set debug flag to true. Otherwise use default settings. See the default constructor.
     *
     * @param debug turn debug on with a true argument
     */
    public HTMLFilter(final boolean debug) {
        this();
        vDebug = debug;
    }

    /**
     * Map-parameter configurable constructor.
     *
     * Required keys: vAllowed (a {@code Map<String, List<String>>}), and the
     * {@code String[]} valued vSelfClosingTags, vNeedClosingTags, vDisallowed,
     * vAllowedProtocols, vProtocolAtts, vRemoveBlanks, vAllowedEntities.
     * Optional {@code Boolean} keys, each defaulting to true: stripComment,
     * encodeQuotes, alwaysMakeTags. Note that an empty vAllowed map means
     * "every tag and attribute is allowed" (see the whitelist checks).
     *
     * @param conf map containing configuration. keys match field names.
     */
    @SuppressWarnings("unchecked") // the configuration map is untyped by design
    public HTMLFilter(final Map<String,Object> conf) {

        assert conf.containsKey("vAllowed") : "configuration requires vAllowed";
        assert conf.containsKey("vSelfClosingTags") : "configuration requires vSelfClosingTags";
        assert conf.containsKey("vNeedClosingTags") : "configuration requires vNeedClosingTags";
        assert conf.containsKey("vDisallowed") : "configuration requires vDisallowed";
        assert conf.containsKey("vAllowedProtocols") : "configuration requires vAllowedProtocols";
        assert conf.containsKey("vProtocolAtts") : "configuration requires vProtocolAtts";
        assert conf.containsKey("vRemoveBlanks") : "configuration requires vRemoveBlanks";
        assert conf.containsKey("vAllowedEntities") : "configuration requires vAllowedEntities";

        // accept any Map implementation, not only HashMap
        vAllowed = Collections.unmodifiableMap((Map<String, List<String>>) conf.get("vAllowed"));
        vSelfClosingTags = (String[]) conf.get("vSelfClosingTags");
        vNeedClosingTags = (String[]) conf.get("vNeedClosingTags");
        vDisallowed = (String[]) conf.get("vDisallowed");
        vAllowedProtocols = (String[]) conf.get("vAllowedProtocols");
        vProtocolAtts = (String[]) conf.get("vProtocolAtts");
        vRemoveBlanks = (String[]) conf.get("vRemoveBlanks");
        vAllowedEntities = (String[]) conf.get("vAllowedEntities");
        stripComment =  conf.containsKey("stripComment") ? (Boolean) conf.get("stripComment") : true;
        encodeQuotes = conf.containsKey("encodeQuotes") ? (Boolean) conf.get("encodeQuotes") : true;
        alwaysMakeTags = conf.containsKey("alwaysMakeTags") ? (Boolean) conf.get("alwaysMakeTags") : true;
    }

    /** Clears the per-call open-tag counters. */
    private void reset() {
        vTagCounts.clear();
    }

    /** Logs the message at INFO level, but only when the debug flag is set. */
    private void debug(final String msg) {
        if (vDebug) {
            Logger.getAnonymousLogger().info(msg);
        }
    }

    //---------------------------------------------------------------
    // my versions of some PHP library functions

    /**
     * Converts a numeric character code to a String.
     *
     * @param decimal the character code
     * @return a one-char String for values that fit in a char (other values are narrowed
     *         by a char cast), or the surrogate pair for a supplementary code point
     */
    public static String chr(final int decimal) {
        if (Character.isSupplementaryCodePoint(decimal)) {
            // a plain (char) cast would silently truncate these to an unrelated character
            return new String(Character.toChars(decimal));
        }
        return String.valueOf((char) decimal);
    }

    /**
     * Escapes the four html special characters.
     *
     * @param s text to escape
     * @return s with &amp;, &quot;, &lt; and &gt; replaced by their entities
     */
    public static String htmlSpecialChars(final String s) {
        String result = s;
        // ampersand first, so the entities added below are not escaped again
        result = regexReplace(P_AMP, "&amp;", result);
        result = regexReplace(P_QUOTE, "&quot;", result);
        result = regexReplace(P_LEFT_ARROW, "&lt;", result);
        result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
        return result;
    }

    //---------------------------------------------------------------
    /**
     * given a user submitted input String, filter out any invalid or restricted
     * html.
     *
     * The stages run in this order: escape comment bodies, balance angle brackets,
     * whitelist tags and attributes, remove blank tags, validate entities.
     *
     * @param input text (i.e. submitted by a user) than may contain html; must not be null
     * @return "clean" version of input, with only valid, whitelisted html elements allowed
     */
    public String filter(final String input) {
        reset();
        String s = input;

        debug("************************************************");
        debug("              INPUT: " + input);

        s = escapeComments(s);
        debug("     escapeComments: " + s);

        s = balanceHTML(s);
        debug("        balanceHTML: " + s);

        s = checkTags(s);
        debug("          checkTags: " + s);

        s = processRemoveBlanks(s);
        debug("processRemoveBlanks: " + s);

        s = validateEntities(s);
        debug("    validateEntites: " + s);

        debug("************************************************\n\n");
        return s;
    }

    /** @return whether unbalanced angle brackets are turned into tags rather than escaped */
    public boolean isAlwaysMakeTags(){
        return alwaysMakeTags;
    }

    /** @return whether html comments are stripped from the input */
    public boolean isStripComments(){
        return stripComment;
    }

    /**
     * Html-escapes the body of every comment so that angle brackets inside a comment
     * cannot be mistaken for tags by the later stages.
     */
    private String escapeComments(final String s) {
        final Matcher m = P_COMMENTS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        // loop over ALL comments; escaping only the first one would leave the rest raw
        while (m.find()) {
            final String match = m.group(1); //(.*?)
            m.appendReplacement(buf, Matcher.quoteReplacement("<!--" + htmlSpecialChars(match) + "-->"));
        }
        m.appendTail(buf);

        return buf.toString();
    }

    /**
     * Repairs unbalanced angle brackets: either completes them into tags
     * (alwaysMakeTags) or html-escapes the stray ones.
     */
    private String balanceHTML(String s) {
        if (alwaysMakeTags) {
            //
            // try and form html
            //
            s = regexReplace(P_END_ARROW, "", s);
            s = regexReplace(P_BODY_TO_END, "<$1>", s);
            s = regexReplace(P_XML_CONTENT, "$1<$2", s);

        } else {
            //
            // escape stray brackets
            //
            s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
            s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);

            //
            // the last regexp causes '<>' entities to appear
            // (we need to do a lookahead assertion so that the last bracket can
            // be used in the next pass of the regexp)
            //
            s = regexReplace(P_BOTH_ARROWS, "", s);
        }

        return s;
    }

    /**
     * Runs every "<...>" span through {@link #processTag(String)} and then appends a
     * closing tag for each element that is still open.
     */
    private String checkTags(final String s) {
        final Matcher m = P_TAGS.matcher(s);

        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(buf, Matcher.quoteReplacement(processTag(m.group(1))));
        }
        m.appendTail(buf);

        // these get tallied in processTag
        // (remember to reset before subsequent calls to filter method)
        final StringBuilder result = new StringBuilder(buf);
        for (Map.Entry<String, Integer> open : vTagCounts.entrySet()) {
            for (int ii = 0; ii < open.getValue(); ii++) {
                result.append("</").append(open.getKey()).append('>');
            }
        }

        return result.toString();
    }

    /**
     * Removes content-free occurrences ("<b></b>" or "<b />") of every tag listed in
     * vRemoveBlanks.
     */
    private String processRemoveBlanks(final String s) {
        String result = s;
        for (String tag : vRemoveBlanks) {
            final Pattern pair = cachedPattern(P_REMOVE_PAIR_BLANKS, tag, "<" + tag + "(\\s[^>]*)?></" + tag + ">");
            result = regexReplace(pair, "", result);
            final Pattern self = cachedPattern(P_REMOVE_SELF_BLANKS, tag, "<" + tag + "(\\s[^>]*)?/>");
            result = regexReplace(self, "", result);
        }

        return result;
    }

    /** Returns the pattern cached under key, compiling and caching regex on first use. */
    private static Pattern cachedPattern(final ConcurrentMap<String,Pattern> cache, final String key, final String regex) {
        Pattern pattern = cache.get(key);
        if (pattern == null) {
            final Pattern compiled = Pattern.compile(regex);
            pattern = cache.putIfAbsent(key, compiled);
            if (pattern == null) {
                pattern = compiled;
            }
        }
        return pattern;
    }

    /** Replaces every match of regex_pattern in s ("$n" group references are honoured). */
    private static String regexReplace(final Pattern regex_pattern, final String replacement, final String s) {
        return regex_pattern.matcher(s).replaceAll(replacement);
    }

    /**
     * Filters the inside of one "<...>" span.
     *
     * @param s the text between the angle brackets
     * @return the cleaned-up tag including its brackets, or "" when it must be dropped
     */
    private String processTag(final String s) {
        // ending tags
        Matcher m = P_END_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(java.util.Locale.ENGLISH);
            if (allowed(name) && !inArray(name, vSelfClosingTags)) {
                final Integer open = vTagCounts.get(name);
                // only close what is actually open: a surplus "</b>" must not be emitted,
                // nor may it push the counter below zero and hide a later unclosed "<b>"
                if (open != null && open > 0) {
                    vTagCounts.put(name, open - 1);
                    return "</" + name + ">";
                }
            }
        }

        // starting tags
        m = P_START_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(java.util.Locale.ENGLISH);
            final String body = m.group(2);
            String ending = m.group(3);

            if (!allowed(name)) {
                return "";
            }

            final String params = buildAttributes(name, body);

            if (inArray(name, vSelfClosingTags)) {
                ending = " /";
            }

            if (inArray(name, vNeedClosingTags)) {
                ending = "";
            }

            if (ending == null || ending.length() < 1) {
                // a real opening tag: remember it so that it can be balanced later
                final Integer open = vTagCounts.get(name);
                vTagCounts.put(name, open == null ? 1 : open + 1);
            } else {
                ending = " /";
            }
            return "<" + name + params + ending + ">";
        }

        // comments
        m = P_COMMENT.matcher(s);
        if (!stripComment && m.find()) {
            return  "<" + m.group() + ">";
        }

        return "";
    }

    /**
     * Re-serialises the whitelisted attributes found in a start tag's body; quoted
     * attributes come first, then unquoted ones.
     *
     * @param name lower-cased element name
     * @param body the text following the element name inside the tag
     * @return the attributes as ' name="value"' fragments, or "" if none survive
     */
    private String buildAttributes(final String name, final String body) {
        final List<String> paramNames = new ArrayList<String>();
        final List<String> paramValues = new ArrayList<String>();

        final Matcher quoted = P_QUOTED_ATTRIBUTES.matcher(body);
        while (quoted.find()) {
            paramNames.add(quoted.group(1)); //([a-z0-9]+)
            paramValues.add(quoted.group(3)); //(.*?)
        }
        final Matcher unquoted = P_UNQUOTED_ATTRIBUTES.matcher(body);
        while (unquoted.find()) {
            paramNames.add(unquoted.group(1)); //([a-z0-9]+)
            paramValues.add(unquoted.group(3)); //([^\"\\s']+)
        }

        final StringBuilder params = new StringBuilder();
        for (int ii = 0; ii < paramNames.size(); ii++) {
            final String paramName = paramNames.get(ii).toLowerCase(java.util.Locale.ENGLISH);
            String paramValue = paramValues.get(ii);

            if (allowedAttribute(name, paramName)) {
                if (inArray(paramName, vProtocolAtts)) {
                    paramValue = processParamProtocol(paramValue);
                }
                // The value is always written back inside double quotes, so a double quote
                // surviving from a single-quoted value (alt='x" onerror="...') would end
                // the attribute early and smuggle in new attributes. Escape it.
                paramValue = paramValue.replace("\"", "&quot;");
                params.append(' ').append(paramName).append("=\"").append(paramValue).append('"');
            }
        }
        return params.toString();
    }

    /**
     * Decodes the attribute value and, when it starts with a protocol that is not
     * whitelisted, turns it into a local anchor link ("#...").
     */
    private String processParamProtocol(String s) {
        s = decodeEntities(s);
        final Matcher m = P_PROTOCOL.matcher(s);
        if (m.find()) {
            final String protocol = m.group(1);
            if (!inArray(protocol, vAllowedProtocols)) {
                // bad protocol, turn into local anchor link instead
                s = "#" + s.substring(protocol.length() + 1, s.length());
                if (s.startsWith("#//")) {
                    s = "#" + s.substring(3, s.length());
                }
            }
        }

        return s;
    }

    /**
     * Decodes decimal entities ("&amp;#104;"), hex entities ("&amp;#x68;") and
     * percent escapes ("%68"), then re-validates the entities of the result.
     */
    private String decodeEntities(String s) {
        // decimal entities are base 10 even with leading zeros ("&#0104;" is 'h', not octal)
        s = decodeNumeric(P_ENTITY, 10, s);
        s = decodeNumeric(P_ENTITY_UNICODE, 16, s);
        s = decodeNumeric(P_ENCODE, 16, s);

        s = validateEntities(s);
        return s;
    }

    /**
     * Replaces every match of pattern by the character whose code is group 1 parsed in
     * the given radix. Matches whose number does not fit an int are left untouched
     * instead of aborting the whole filter run with a NumberFormatException.
     */
    private static String decodeNumeric(final Pattern pattern, final int radix, final String s) {
        final StringBuffer buf = new StringBuffer();
        final Matcher m = pattern.matcher(s);
        while (m.find()) {
            String decoded;
            try {
                decoded = chr(Integer.parseInt(m.group(1), radix));
            } catch (NumberFormatException tooLarge) {
                decoded = m.group();
            }
            m.appendReplacement(buf, Matcher.quoteReplacement(decoded));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Keeps whitelisted, properly terminated entities and escapes the ampersand of
     * everything else; finally applies quote encoding.
     */
    private String validateEntities(final String s) {
        final StringBuffer buf = new StringBuffer();

        // validate entities throughout the string
        final Matcher m = P_VALID_ENTITIES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); //([^&;]*)
            final String two = m.group(2); //(?=(;|&|$))
            m.appendReplacement(buf, Matcher.quoteReplacement(checkEntity(one, two)));
        }
        m.appendTail(buf);

        return encodeQuotes(buf.toString());
    }

    /**
     * When quote encoding is enabled, rewrites double quotes that occur in text content
     * (outside of tags) as "&amp;quot;"; otherwise returns s unchanged.
     */
    private String encodeQuotes(final String s){
        if (!encodeQuotes) {
            return s;
        }
        final StringBuffer buf = new StringBuffer();
        final Matcher m = P_VALID_QUOTES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); //(>|^)
            final String two = m.group(2); //([^<]+?)
            final String three = m.group(3); //(<|$)
            m.appendReplacement(buf, Matcher.quoteReplacement(one + regexReplace(P_QUOTE, "&quot;", two) + three));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * @param preamble the text following an ampersand
     * @param term what terminates it: ";", "&amp;" or "" (end of input)
     * @return the entity start unchanged when it is whitelisted and terminated by ";",
     *         otherwise with its ampersand escaped
     */
    private String checkEntity(final String preamble, final String term) {

        return ";".equals(term) && isValidEntity(preamble)
                ? '&' + preamble
                : "&amp;" + preamble;
    }

    /** @return whether the entity name is listed in vAllowedEntities */
    private boolean isValidEntity(final String entity) {
        return inArray(entity, vAllowedEntities);
    }

    /** @return whether array contains an element equal to s (null elements never match) */
    private static boolean inArray(final String s, final String[] array) {
        for (String item : array) {
            if (item != null && item.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /** @return whether the element is whitelisted (an empty whitelist allows all) and not disallowed */
    private boolean allowed(final String name) {
        return (vAllowed.isEmpty() || vAllowed.containsKey(name)) && !inArray(name, vDisallowed);
    }

    /** @return whether the attribute is whitelisted for the given (allowed) element */
    private boolean allowedAttribute(final String name, final String paramName) {
        return allowed(name) && (vAllowed.isEmpty() || vAllowed.get(name).contains(paramName));
    }
}