/*
 * Shredzone Commons
 *
 * Copyright (C) 2012 Richard "Shred" Körber
 *   http://commons.shredzone.org
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.shredzone.commons.text.filter;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.shredzone.commons.text.TextFilter;

/**
 * A filter that detects HTML hyperlinks, and adds a {@code rel="nofollow"} attribute.
 * This filter can be used to post-process HTML content created by a site visitor, so
 * web crawlers won't follow their links.
 * <p>
 * Only {@code <a>} tags whose {@code href} starts with one of the schemes
 * {@code http}, {@code https}, {@code ftp}, {@code mailto} or {@code file} are
 * changed; matching is case insensitive. The filter does not check whether a tag
 * already carries a {@code rel} attribute. Because {@code .} does not match line
 * terminators here, a tag is only recognized when the part between the scheme and
 * the closing {@code >} is on a single line.
 *
 * @author Richard "Shred" Körber
 */
public class NofollowLinksFilter implements TextFilter {

    /**
     * Matches an opening anchor tag with an absolute link target, up to but excluding
     * its closing {@code >}. Group 1 is the tag text that is kept as is.
     */
    private static final Pattern HREF_PATTERN = Pattern.compile(
            "(<a[^>]+?href\\s*=\\s*[\"']?(?:https?|ftp|mailto|file):.+?)>",
            Pattern.CASE_INSENSITIVE);

    /** Replacement that re-emits the matched tag and appends the attribute. */
    private static final String NOFOLLOW_REPLACEMENT = "$1 rel=\"nofollow\">";

    /**
     * Adds {@code rel="nofollow"} to every hyperlink found in the given text.
     *
     * @param text
     *            HTML text to be filtered
     * @return the filtered text, or the {@code text} instance itself if it does not
     *         contain any matching hyperlink
     */
    @Override
    public CharSequence apply(CharSequence text) {
        Matcher m = HREF_PATTERN.matcher(text);

        // find() instead of matches(): links must be detected anywhere in the text,
        // not only when the entire input happens to match the pattern.
        if (!m.find()) {
            return text;
        }

        // replaceAll() resets the matcher first, so the find() above is only a probe.
        return m.replaceAll(NOFOLLOW_REPLACEMENT);
    }

}