001/*
002 * Shredzone Commons
003 *
004 * Copyright (C) 2012 Richard "Shred" Körber
005 *   http://commons.shredzone.org
006 *
007 * This program is free software: you can redistribute it and/or modify
008 * it under the terms of the GNU Library General Public License as
009 * published by the Free Software Foundation, either version 3 of the
010 * License, or (at your option) any later version.
011 *
012 * This program is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
015 * GNU General Public License for more details.
016 *
017 * You should have received a copy of the GNU Library General Public License
018 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
019 */
020package org.shredzone.commons.text.filter;
021
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
025import org.shredzone.commons.text.TextFilter;
026
027/**
028 * A filter that detects HTML hyperlinks, and adds a {@code rel="nofollow"} attribute.
029 * This filter can be used to post-process HTML content created by a site visitor, so
030 * web crawlers won't follow their links.
031 *
032 * @author Richard "Shred" Körber
033 */
034public class NofollowLinksFilter implements TextFilter {
035
036    private static final Pattern HREF_PATTERN = Pattern.compile(
037            "(.*?)(<a[^>]+?href\\s*=\\s*[\"']?(?:https?|ftp|mailto|file):.+?)>",
038            Pattern.CASE_INSENSITIVE);
039
040    @Override
041    public CharSequence apply(CharSequence text) {
042        Matcher m = HREF_PATTERN.matcher(text);
043        if (!m.matches()) {
044            return text;
045        }
046        return m.replaceAll("$1$2 rel=\"nofollow\">");
047    }
048
049}