001/* 002 * Shredzone Commons 003 * 004 * Copyright (C) 2012 Richard "Shred" Körber 005 * http://commons.shredzone.org 006 * 007 * This program is free software: you can redistribute it and/or modify 008 * it under the terms of the GNU Library General Public License as 009 * published by the Free Software Foundation, either version 3 of the 010 * License, or (at your option) any later version. 011 * 012 * This program is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 015 * GNU General Public License for more details. 016 * 017 * You should have received a copy of the GNU Library General Public License 018 * along with this program. If not, see <http://www.gnu.org/licenses/>. 019 */ 020package org.shredzone.commons.text.filter; 021 022import java.util.regex.Matcher; 023import java.util.regex.Pattern; 024 025import org.shredzone.commons.text.TextFilter; 026 027/** 028 * A filter that strips HTML markup from a text. 029 * <p> 030 * This filter distinguishes between block and inline tags. Block tags are replaced by a 031 * whitespace (unless there already was a whitespace before the tag), while inline tags 032 * are just removed. 033 * <p> 034 * Rationale: "foo<br>bar" is converted to "foo bar", while 035 * "foo<i>bar</i>" is converted to "foobar", similar to the way it is 036 * displayed in a web browser. 037 * 038 * @author Richard "Shred" Körber 039 */ 040public class StripHtmlFilter implements TextFilter { 041 042 // Inline tags that do not need to be replaced by a whitespace 043 private static final Pattern INLINE_TAGS = Pattern.compile("code|em|strong|samp|" + 044 "kbd|var|cite|dfn|abbr|acronym|q|del|ins|bdo|b|i|u|tt|s|strike|big|small|" + 045 "sup|sub|span|img", Pattern.CASE_INSENSITIVE); 046 047 private static final Pattern TAG_PATTERN = Pattern.compile("</?([a-zA-Z0-9]+)(?:\\\"[^\"]*\\\"|[^>])*>", Pattern.DOTALL); 048 049 @Override 050 public CharSequence apply(CharSequence text) { 051 StringBuffer sb = new StringBuffer(); 052 053 Matcher m = TAG_PATTERN.matcher(text); 054 while (m.find()) { 055 String tag = m.group(1); 056 057 boolean isInline = INLINE_TAGS.matcher(tag).matches(); 058 if (!isInline && sb.length() > 0 && !Character.isWhitespace(sb.charAt(sb.length() - 1))) { 059 m.appendReplacement(sb, " "); 060 } else { 061 m.appendReplacement(sb, ""); 062 } 063 } 064 m.appendTail(sb); 065 066 if (sb.length() > 0 && Character.isWhitespace(sb.charAt(sb.length() - 1))) { 067 sb.deleteCharAt(sb.length() - 1); 068 } 069 070 return sb; 071 } 072 073}