001/*
002 * Shredzone Commons
003 *
004 * Copyright (C) 2012 Richard "Shred" Körber
005 *   http://commons.shredzone.org
006 *
007 * This program is free software: you can redistribute it and/or modify
008 * it under the terms of the GNU Library General Public License as
009 * published by the Free Software Foundation, either version 3 of the
010 * License, or (at your option) any later version.
011 *
012 * This program is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
015 * GNU General Public License for more details.
016 *
017 * You should have received a copy of the GNU Library General Public License
018 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
019 */
020package org.shredzone.commons.text.filter;
021
import static java.util.stream.Collectors.toSet;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.umd.cs.findbugs.annotations.Nullable;
import org.shredzone.commons.text.TextFilter;
034
035/**
036 * A filter that simplifies a HTML text. Only a given set of HTML tags (and attributes)
037 * are accepted. Other HTML tags are completely removed.
038 * <p>
039 * This filter can be used to allow a site visitor to enter marked-up text, but remove
040 * everything that might be harmful or induce Cross Site Scripting.
041 *
042 * @author Richard "Shred" Körber
043 */
044public class SimplifyHtmlFilter implements TextFilter {
045
046    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+(>|$)", Pattern.DOTALL);
047    private static final Pattern TAG_OPEN = Pattern.compile("<(\\w+)\\s*(.*?)\\s*(/?)>");
048    private static final Pattern TAG_CLOSE = Pattern.compile("</(\\w+)\\s*>");
049
050    private final Map<String, Set<String>> acceptedTags = new HashMap<>();
051
052    /**
053     * Adds a tag that is accepted by this filter, with all its attributes.
054     *
055     * @param tag
056     *            HTML tag that is accepted (without angle brackets, e.g. "strong")
057     */
058    public void addAcceptedTag(String tag) {
059        addAcceptedTag(tag, (String[]) null);
060    }
061
062    /**
063     * Adds a tag that is accepted by this filter, along with accepted attributes.
064     *
065     * @param tag
066     *            HTML tag that is accepted (without angle brackets, e.g. "img")
067     * @param attributes
068     *            an array of accepted attributes (e.g. "src", "alt")
069     */
070    public void addAcceptedTag(String tag, String... attributes) {
071        Objects.requireNonNull(tag);
072
073        if (attributes == null || attributes.length == 0) {
074            acceptedTags.put(tag.toLowerCase(), null);
075            return;
076        }
077
078        Set<String> attributeSet = Arrays.stream(attributes)
079                .map(String::toLowerCase)
080                .collect(toSet());
081
082        acceptedTags.put(tag.toLowerCase(), attributeSet);
083    }
084
085    @Override
086    public CharSequence apply(CharSequence text) {
087        StringBuffer sb = new StringBuffer(text.length() * 11 / 10);
088
089        Matcher m = TAG_PATTERN.matcher(text);
090        while (m.find()) {
091            m.appendReplacement(sb, processTag(m.group(0)));
092        }
093        m.appendTail(sb);
094
095        return sb;
096    }
097
098    /**
099     * Processes a tag that was spotted.
100     *
101     * @param text
102     *            Tag (complete tag including the angle brackets)
103     * @return Cleaned up tag, or empty string if the tag was not accepted
104     */
105    private String processTag(CharSequence text) {
106        if (text.charAt(1) != '/') {
107            // Opening tag or empty element shorthand
108            Matcher m1 = TAG_OPEN.matcher(text);
109            if (m1.matches()) {
110                String tag = m1.group(1).toLowerCase();
111                String attr = m1.group(2);
112                String closing = m1.group(3);
113                if (acceptedTags.containsKey(tag)) {
114                    StringBuilder result = new StringBuilder();
115                    result.append('<').append(tag);
116                    processAttributes(attr, result, acceptedTags.get(tag));
117                    if ("/".equals(closing)) {
118                        result.append(" /");
119                    }
120                    result.append('>');
121                    return result.toString();
122                }
123            }
124
125        } else {
126            // Closing tag
127            Matcher m2 = TAG_CLOSE.matcher(text);
128            if (m2.matches()) {
129                String tag = m2.group(1).toLowerCase();
130                if (acceptedTags.containsKey(tag)) {
131                    return "</" + tag + ">";
132                }
133            }
134        }
135
136        return "";
137    }
138
139    /**
140     * Processes an attribute string and builds clean attributes if accepted.
141     *
142     * @param attr
143     *            Raw attribute string
144     * @param result
145     *            StringBuilder where to append clean attributes to
146     * @param accepted
147     *            Set of accepted attributes, may be empty or {@code null} if any
148     *            attribute is accepted
149     */
150    private void processAttributes(String attr, StringBuilder result, @Nullable Set<String> accepted) {
151        // If we accept no attributes, we will not change the result anyways.
152        if (accepted == null || accepted.isEmpty()) {
153            return;
154        }
155
156        int pos = 0;
157        int max = attr.length();
158
159        while (pos < max) {
160            // Attribute name
161            StringBuilder attrName = new StringBuilder(attr.length() * 11 / 10);
162            StringBuilder attrValue = null;
163
164            while (pos < max) {
165                char ch = attr.charAt(pos);
166                if (!Character.isLetterOrDigit(ch)) {
167                    break;
168                }
169                attrName.append(ch);
170                pos++;
171            }
172
173            // Skip Whitespaces
174            while (pos < max && Character.isWhitespace(attr.charAt(pos))) {
175                pos++;
176            }
177
178            if (pos < max && attr.charAt(pos) == '=') {
179                attrValue = new StringBuilder();
180
181                // Attribute with value
182                pos++;
183
184                // Skip Whitespaces
185                while (pos < max && Character.isWhitespace(attr.charAt(pos))) {
186                    pos++;
187                }
188
189                if (pos < max) {
190                    char quote = attr.charAt(pos);
191                    if (quote == '"' || quote == '\'') {
192                        pos++;  // skip opening quote
193
194                        while (pos < max && attr.charAt(pos) != quote) {
195                            attrValue.append(attr.charAt(pos));
196                            pos++;
197                        }
198
199                        pos++;  // skip closing quote
200
201                    } else {
202                        // Attributes without quotes, just copy to the end
203                        while (pos < max && !Character.isWhitespace(attr.charAt(pos))) {
204                            attrValue.append(attr.charAt(pos));
205                            pos++;
206                        }
207                    }
208                }
209
210                // Skip trailing whitespaces
211                while (pos < max && Character.isWhitespace(attr.charAt(pos))) {
212                    pos++;
213                }
214            }
215
216            if (accepted.contains(attrName.toString().toLowerCase())) {
217                result.append(' ').append(attrName).append('=');
218                if (attrValue != null) {
219                    result.append('"');
220                    // There should never be plain quotes in an attribute value!
221                    result.append(attrValue.toString().replace("\"", "&quot;"));
222                    result.append('"');
223                } else {
224                    result.append('"').append(attrName).append('"');
225                }
226            }
227        }
228    }
229
230}