001/*
002 * Shredzone Commons
003 *
004 * Copyright (C) 2012 Richard "Shred" Körber
005 *   http://commons.shredzone.org
006 *
007 * This program is free software: you can redistribute it and/or modify
008 * it under the terms of the GNU Library General Public License as
009 * published by the Free Software Foundation, either version 3 of the
010 * License, or (at your option) any later version.
011 *
012 * This program is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
015 * GNU General Public License for more details.
016 *
017 * You should have received a copy of the GNU Library General Public License
018 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
019 */
020
021package org.shredzone.commons.view.util;
022
023import java.net.URLDecoder;
024import java.net.URLEncoder;
025import java.nio.charset.StandardCharsets;
026import java.text.Normalizer;
027import java.util.regex.Matcher;
028import java.util.regex.Pattern;
029
030import javax.annotation.Nonnull;
031import javax.annotation.ParametersAreNonnullByDefault;
032
033/**
034 * Utility methods for view path management.
035 *
036 * @author Richard "Shred" Körber
037 */
038@ParametersAreNonnullByDefault
039public final class PathUtils {
040
041    private PathUtils() {
042        // Utility class without constructor
043    }
044
045    /**
046     * Simplifies a path part. The resulting string only contains numbers ([0-9]) and
047     * lowercase characters ([a-z]). One ore more consecutive whitespaces or a few
048     * non-ascii characters are converted into a single dash '-'. All other characters are
049     * either converted to ASCII characters, or removed.
050     * <p>
051     * This method can be used to convert e.g. titles into URL parts, for search engine
052     * optimization.
053     * <p>
054     * On accented characters, the accent is removed. However, currently German umlauts
055     * are converted into their respective ASCII counterparts ('ä' -&gt; 'ae'). Future
056     * implementations may also contain translations for other language's accented
057     * characters.
058     * <p>
059     * Consider this method as one-way encoding. Future releases may return different
060     * strings.
061     *
062     * @param part
063     *            path part to simplify
064     * @return simplified path part
065     */
066    public static @Nonnull String simplify(String part) {
067        StringBuilder result = new StringBuilder(part.length());
068
069        boolean lastWasDash = false;
070
071        for (char ch : part.toLowerCase().toCharArray()) {
072            if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'z')) {
073                result.append(ch);
074                lastWasDash = false;
075
076            } else if (ch == ' ' || ch == '+' || ch == '-' || ch == '_' || ch == '&') {
077                if (!lastWasDash) {
078                    result.append('-');
079                }
080                lastWasDash = true;
081
082            } else if (ch >= 128) {
083                // TODO: German-centric... Is there an international implementation?
084                switch (ch) {
085                    case 'ä':
086                    case 'Ä':
087                        result.append("ae");
088                        lastWasDash = false;
089                        break;
090
091                    case 'ö':
092                    case 'Ö':
093                        result.append("oe");
094                        lastWasDash = false;
095                        break;
096
097                    case 'ü':
098                    case 'Ü':
099                        result.append("ue");
100                        lastWasDash = false;
101                        break;
102
103                    case 'ß':
104                        result.append("ss");
105                        lastWasDash = false;
106                        break;
107
108                    default:
109                        String normalized = Normalizer.normalize(Character.toString(ch), Normalizer.Form.NFKD);
110                        for (char nch : normalized.toLowerCase().toCharArray()) {
111                            if (Character.isLetterOrDigit(nch)) {
112                                lastWasDash = false;
113                                result.append(nch);
114                            }
115                        }
116                }
117            }
118        }
119
120        return result.toString();
121    }
122
123    /**
124     * Suggests a file name suffix for the given content type.
125     * <p>
126     * The current implementation only detects the standard HTML image types.
127     *
128     * @param mime
129     *            content type to find a suffix for
130     * @return suggested suffix, or "bin" if there is no known suffix
131     */
132    public static @Nonnull String suffix(String mime) {
133        // Prominent Mime Types
134        switch (mime) {
135            case "image/png":     return "png";
136            case "image/jpeg":    return "jpg";
137            case "image/gif":     return "gif";
138            case "image/svg+xml": return "svg";
139            case "image/tiff":    return "tif";
140        }
141
142        // Try to guess
143        Matcher m = Pattern.compile("^.*?/(.{1,6}?)(\\+.*)?$").matcher(mime);
144        if (m.matches()) {
145            return m.group(1);
146        }
147
148        // Is it a text?
149        if (mime.startsWith("text/")) {
150            return "txt";
151        }
152
153        // Fallback to bin
154        return "bin";
155    }
156
157    /**
158     * URL encodes a string. utf-8 charset is used for encoding.
159     * <p>
160     * This is a convenience call of {@link URLEncoder#encode(String, String)} with
161     * exception handling.
162     *
163     * @param string
164     *            string to be URL encoded
165     * @return encoded string
166     */
167    public static @Nonnull String encode(String string) {
168        return URLEncoder.encode(string, StandardCharsets.UTF_8);
169    }
170
171    /**
172     * URL decodes a string. utf-8 charset is used for decoding.
173     * <p>
174     * This is a convenience call of {@link URLDecoder#decode(String, String)} with
175     * exception handling.
176     *
177     * @param string
178     *            the string to be URL decoded
179     * @return decoded string
180     */
181    public static @Nonnull String decode(String string) {
182        return URLDecoder.decode(string, StandardCharsets.UTF_8);
183    }
184
185}