001/*
002 * Shredzone Commons
003 *
004 * Copyright (C) 2012 Richard "Shred" Körber
005 *   http://commons.shredzone.org
006 *
007 * This program is free software: you can redistribute it and/or modify
008 * it under the terms of the GNU Library General Public License as
009 * published by the Free Software Foundation, either version 3 of the
010 * License, or (at your option) any later version.
011 *
012 * This program is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
015 * GNU General Public License for more details.
016 *
017 * You should have received a copy of the GNU Library General Public License
018 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
019 */
020package org.shredzone.commons.text.filter;
021
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
025import org.shredzone.commons.text.TextFilter;
026
027/**
028 * A filter that strips HTML markup from a text.
029 * <p>
030 * This filter distinguishes between block and inline tags. Block tags are replaced by a
031 * whitespace (unless there already was a whitespace before the tag), while inline tags
032 * are just removed.
033 * <p>
034 * Rationale: "foo&lt;br&gt;bar" is converted to "foo bar", while
035 * "foo&lt;i&gt;bar&lt;/i&gt;" is converted to "foobar", similar to the way it is
036 * displayed in a web browser.
037 *
038 * @author Richard "Shred" Körber
039 */
040public class StripHtmlFilter implements TextFilter {
041
042    // Inline tags that do not need to be replaced by a whitespace
043    private static final Pattern INLINE_TAGS = Pattern.compile("code|em|strong|samp|" +
044            "kbd|var|cite|dfn|abbr|acronym|q|del|ins|bdo|b|i|u|tt|s|strike|big|small|" +
045            "sup|sub|span|img", Pattern.CASE_INSENSITIVE);
046
047    private static final Pattern TAG_PATTERN = Pattern.compile("</?([a-zA-Z0-9]+)(?:\\\"[^\"]*\\\"|[^>])*>", Pattern.DOTALL);
048
049    @Override
050    public CharSequence apply(CharSequence text) {
051        StringBuffer sb = new StringBuffer();
052
053        Matcher m = TAG_PATTERN.matcher(text);
054        while (m.find()) {
055            String tag = m.group(1);
056
057            boolean isInline = INLINE_TAGS.matcher(tag).matches();
058            if (!isInline && sb.length() > 0 && !Character.isWhitespace(sb.charAt(sb.length() - 1))) {
059                m.appendReplacement(sb, " ");
060            } else {
061                m.appendReplacement(sb, "");
062            }
063        }
064        m.appendTail(sb);
065
066        if (sb.length() > 0 && Character.isWhitespace(sb.charAt(sb.length() - 1))) {
067            sb.deleteCharAt(sb.length() - 1);
068        }
069
070        return sb;
071    }
072
073}