/*
 * Shredzone Commons
 *
 * Copyright (C) 2012 Richard "Shred" Körber
 *   http://commons.shredzone.org
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.shredzone.commons.text.filter;

import static java.util.stream.Collectors.toSet;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.umd.cs.findbugs.annotations.Nullable;
import org.shredzone.commons.text.TextFilter;

/**
 * A filter that simplifies a HTML text. Only a given set of HTML tags (and attributes)
 * are accepted. Other HTML tags are completely removed.
 * <p>
 * This filter can be used to allow a site visitor to enter marked-up text, but remove
 * everything that might be harmful or induce Cross Site Scripting.
 * <p>
 * Accepted tags are rebuilt from scratch: the tag name is written in lower case, only
 * accepted attributes are kept, and every attribute value is written in double quotes.
 * Attribute values themselves are not validated.
 *
 * @author Richard "Shred" Körber
 */
public class SimplifyHtmlFilter implements TextFilter {

    /** Anything that looks like a tag, including an unterminated tag at the end of text. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+(>|$)", Pattern.DOTALL);

    /**
     * Opening tag or empty element: name, raw attribute string, optional trailing slash.
     * DOTALL is required so that attributes spread over several lines are matched as well.
     */
    private static final Pattern TAG_OPEN = Pattern.compile("<(\\w+)\\s*(.*?)\\s*(/?)>", Pattern.DOTALL);

    /** Closing tag. */
    private static final Pattern TAG_CLOSE = Pattern.compile("</(\\w+)\\s*>");

    /**
     * Accepted tags (lower case) mapped to their accepted attributes (lower case). A
     * {@code null} value means that the tag is accepted, but none of its attributes.
     */
    private final Map<String, Set<String>> acceptedTags = new HashMap<>();

    /**
     * Adds a tag that is accepted by this filter. All attributes of the tag are
     * removed.
     *
     * @param tag
     *            HTML tag that is accepted (without angle brackets, e.g. "strong")
     */
    public void addAcceptedTag(String tag) {
        addAcceptedTag(tag, (String[]) null);
    }

    /**
     * Adds a tag that is accepted by this filter, along with accepted attributes. Tag
     * and attribute names are matched case insensitively. If the tag was already added,
     * its accepted attributes are replaced.
     *
     * @param tag
     *            HTML tag that is accepted (without angle brackets, e.g. "img")
     * @param attributes
     *            an array of accepted attributes (e.g. "src", "alt"). If {@code null}
     *            or empty, all attributes of the tag are removed.
     */
    public void addAcceptedTag(String tag, String... attributes) {
        Objects.requireNonNull(tag, "tag");

        // Locale.ROOT keeps the mapping stable, e.g. "TITLE" -> "title" even in a
        // Turkish default locale.
        String key = tag.toLowerCase(Locale.ROOT);

        if (attributes == null || attributes.length == 0) {
            acceptedTags.put(key, null);
            return;
        }

        Set<String> attributeSet = Arrays.stream(attributes)
                .map(attribute -> attribute.toLowerCase(Locale.ROOT))
                .collect(toSet());

        acceptedTags.put(key, attributeSet);
    }

    /**
     * Removes all tags that are not accepted, and cleans up the accepted ones.
     *
     * @param text
     *            text to be filtered
     * @return filtered text
     */
    @Override
    public CharSequence apply(CharSequence text) {
        int length = text.length();
        StringBuffer sb = new StringBuffer(length + length / 10);

        Matcher m = TAG_PATTERN.matcher(text);
        while (m.find()) {
            // The cleaned tag may contain '$' or '\' from attribute values. It must be
            // quoted, otherwise appendReplacement interprets them as group references
            // and escapes.
            m.appendReplacement(sb, Matcher.quoteReplacement(processTag(m.group(0))));
        }
        m.appendTail(sb);

        return sb;
    }

    /**
     * Processes a tag that was spotted.
     *
     * @param text
     *            Tag (complete tag including the angle brackets)
     * @return Cleaned up tag, or empty string if the tag was not accepted
     */
    private String processTag(CharSequence text) {
        if (text.charAt(1) != '/') {
            // Opening tag or empty element shorthand
            Matcher m1 = TAG_OPEN.matcher(text);
            if (m1.matches()) {
                String tag = m1.group(1).toLowerCase(Locale.ROOT);
                String attr = m1.group(2);
                String closing = m1.group(3);
                if (acceptedTags.containsKey(tag)) {
                    StringBuilder result = new StringBuilder();
                    result.append('<').append(tag);
                    processAttributes(attr, result, acceptedTags.get(tag));
                    if ("/".equals(closing)) {
                        result.append(" /");
                    }
                    result.append('>');
                    return result.toString();
                }
            }

        } else {
            // Closing tag
            Matcher m2 = TAG_CLOSE.matcher(text);
            if (m2.matches()) {
                String tag = m2.group(1).toLowerCase(Locale.ROOT);
                if (acceptedTags.containsKey(tag)) {
                    return "</" + tag + ">";
                }
            }
        }

        return "";
    }

    /**
     * Processes an attribute string and builds clean attributes if accepted.
     * <p>
     * Accepted attributes are appended as {@code name="value"}. Attributes without a
     * value are appended as {@code name="name"}.
     *
     * @param attr
     *            Raw attribute string
     * @param result
     *            StringBuilder where to append clean attributes to
     * @param accepted
     *            Set of accepted attributes (lower case), may be empty or {@code null}
     *            if no attribute is accepted
     */
    private void processAttributes(String attr, StringBuilder result, @Nullable Set<String> accepted) {
        // If we accept no attributes, we will not change the result anyways.
        if (accepted == null || accepted.isEmpty()) {
            return;
        }

        int pos = 0;
        int max = attr.length();

        while (pos < max) {
            int start = pos;

            // Attribute name
            StringBuilder attrName = new StringBuilder();
            StringBuilder attrValue = null;

            while (pos < max && isAttributeNameChar(attr.charAt(pos))) {
                attrName.append(attr.charAt(pos));
                pos++;
            }

            pos = skipWhitespace(attr, pos);

            if (pos < max && attr.charAt(pos) == '=') {
                // Attribute with value
                attrValue = new StringBuilder();
                pos = skipWhitespace(attr, pos + 1);

                if (pos < max) {
                    char quote = attr.charAt(pos);
                    if (quote == '"' || quote == '\'') {
                        pos++; // skip opening quote

                        while (pos < max && attr.charAt(pos) != quote) {
                            attrValue.append(attr.charAt(pos));
                            pos++;
                        }

                        pos++; // skip closing quote (may pass max if it is missing)

                    } else {
                        // Attributes without quotes, just copy to the next whitespace
                        while (pos < max && !Character.isWhitespace(attr.charAt(pos))) {
                            attrValue.append(attr.charAt(pos));
                            pos++;
                        }
                    }
                }

                pos = skipWhitespace(attr, pos);
            }

            if (pos == start) {
                // Nothing was consumed: a stray character that is neither part of a
                // name, nor whitespace, nor '='. Skip it, or this loop never ends.
                pos++;
            }

            if (attrName.length() > 0
                    && accepted.contains(attrName.toString().toLowerCase(Locale.ROOT))) {
                result.append(' ').append(attrName).append('=').append('"');
                if (attrValue != null) {
                    // There should never be plain quotes in an attribute value!
                    result.append(attrValue.toString().replace("\"", "&quot;"));
                } else {
                    result.append(attrName);
                }
                result.append('"');
            }
        }
    }

    /**
     * Checks if the character may be part of an attribute name. Besides letters and
     * digits, the separators used in names like {@code data-id} or {@code xml:lang} are
     * accepted, so such a name is read as a single attribute.
     *
     * @param ch
     *            Character to check
     * @return {@code true} if the character belongs to an attribute name
     */
    private static boolean isAttributeNameChar(char ch) {
        return Character.isLetterOrDigit(ch) || ch == '-' || ch == '_' || ch == ':' || ch == '.';
    }

    /**
     * Skips whitespace characters.
     *
     * @param str
     *            String to scan
     * @param pos
     *            Position to start at, may be beyond the end of the string
     * @return Position of the next character that is not a whitespace, or the given
     *         position if it is at or beyond the end of the string
     */
    private static int skipWhitespace(String str, int pos) {
        int max = str.length();
        while (pos < max && Character.isWhitespace(str.charAt(pos))) {
            pos++;
        }
        return pos;
    }

}