001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2014  Oliver Burn
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019package com.puppycrawl.tools.checkstyle.checks;
020
021import java.util.regex.Matcher;
022import java.util.regex.Pattern;
023
024import com.puppycrawl.tools.checkstyle.api.Check;
025import com.puppycrawl.tools.checkstyle.api.DetailAST;
026import com.puppycrawl.tools.checkstyle.api.TokenTypes;
027import com.puppycrawl.tools.checkstyle.api.Utils;
028
029/**
030 * <p>
031 * Restrict using <a href =
032 * "http://docs.oracle.com/javase/specs/jls/se7/html/jls-3.html#jls-3.3">
033 * Unicode escapes</a> (e.g. \u221e).
034 * It is possible to allow using escapes for
035 * <a href="http://en.wiktionary.org/wiki/Appendix:Control_characters">
036 * non-printable(control) characters</a>.
037 * Also, this check can be configured to allow using escapes
038 * if trail comment is present. By the option it is possible to
039 * allow using escapes if literal contains only them. By the option it
040 * is possible to allow using escapes for space literals.
041 * </p>
042 * <p>
043 * Examples of using Unicode:</p>
044 * <pre>
045 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment.
046 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is.
047 * </pre>
048 * <p>
049 * An example of how to configure the check is:
050 * </p>
051 * <pre>
052 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
053 * </pre>
054 * <p>
055 * An example of non-printable(control) characters.
056 * </p>
057 * <pre>
058 * return '\ufeff' + content; // byte order mark
059 * </pre>
060 * <p>
061 * An example of how to configure the check to allow using escapes
062 * for non-printable(control) characters:
063 * </p>
064 * <pre>
065 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
066 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
067 * &lt;/module&gt;
068 * </pre>
069 * <p>
070 * Example of using escapes with trail comment:
071 * </p>
072 * <pre>
073 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s"
074 * </pre>
075 * <p>An example of how to configure the check to allow using escapes
076 * if trail comment is present:
077 * </p>
078 * <pre>
079 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
080 *     &lt;property name="allowByTailComment" value="true"/&gt;
081 * &lt;/module&gt;
082 * </pre>
083 * <p>Example of using escapes if literal contains only them:
084 * </p>
085 * <pre>
086 * String unitAbbrev = "\u03bc\u03bc\u03bc";
087 * </pre>
088 * <p>An example of how to configure the check to allow escapes
089 * if literal contains only them:
090 * </p>
091 * <pre>
092 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
093 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
094 * &lt;/module&gt;
095 * </pre>
096 * <p>An example of how to configure the check to allow non-printable escapes:
097 * </p>
098 * <pre>
099 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
100 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
101 * &lt;/module&gt;
102 * </pre>
103 *
104 * @author maxvetrenko
105 *
106 */
107public class AvoidEscapedUnicodeCharactersCheck
108    extends Check
109{
110     /** Regexp for Unicode chars */
111    private static Pattern sUnicodeRegexp =
112            Utils.getPattern("\\\\u[a-fA-F0-9]{4}");
113
114    /** Regexp Unicode control characters */
115    private static Pattern sUnicodeControl = Utils.getPattern("\\\\(u|U)"
116            + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)"
117            + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]"
118            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
119
120    /** Regexp for trail comment */
121    private static Pattern sCommentRegexp = Utils.getPattern(";[ ]*//+"
122            + "[a-zA-Z0-9 ]*|;[ ]*/[*]{1}+[a-zA-Z0-9 ]*");
123
124    /** Regexp for all escaped chars*/
125    private static Pattern sAllEscapedChars =
126            Utils.getPattern("^((\\\\u)[a-fA-F0-9]{4}"
127                    + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\\\"|\\\')+$");
128
129    /** Regexp for non-printable unicode chars*/
130    private static Pattern sNonPrintableChars = Utils.getPattern("\\\\u1680|\\\\u2028"
131            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
132            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
133            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
134            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
135            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
136            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
137            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
138            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
139            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
140            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
141            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
142            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
143            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
144            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
145            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
146            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
147            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
148            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
149            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
150
151    /** Allow use escapes for non-printable(control) characters.  */
152    private boolean allowEscapesForControlCharacters;
153
154    /** Allow use escapes if trail comment is present*/
155    private boolean allowByTailComment;
156
157    /** Allow if all characters in literal are excaped*/
158    private boolean allowIfAllCharactersEscaped;
159
160    /** Allow escapes for space literals*/
161    private boolean allowNonPrintableEscapes;
162
163    /**
164     * Set allowIfAllCharactersEscaped.
165     * @param allow user's value.
166     */
167    public final void setAllowEscapesForControlCharacters(boolean allow)
168    {
169        allowEscapesForControlCharacters = allow;
170    }
171
172    /**
173     * Set allowByTailComment.
174     * @param allow user's value.
175     */
176    public final void setAllowByTailComment(boolean allow)
177    {
178        allowByTailComment = allow;
179    }
180
181    /**
182     * Set allowIfAllCharactersEscaped.
183     * @param allow user's value.
184     */
185    public final void setAllowIfAllCharactersEscaped(boolean allow)
186    {
187        allowIfAllCharactersEscaped = allow;
188    }
189
190    /**
191     * Set allowSpaceEscapes.
192     * @param allow user's value.
193     */
194    public final void setAllowNonPrintableEscapes(boolean allow)
195    {
196        allowNonPrintableEscapes = allow;
197    }
198
199    @Override
200    public int[] getDefaultTokens()
201    {
202        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
203    }
204
205    @Override
206    public void visitToken(DetailAST ast)
207    {
208
209        final String literal = ast.getText();
210
211        if (hasUnicodeChar(literal)) {
212            if (!(allowByTailComment && haastrailComment(ast)
213                    || isAllCharactersEscaped(literal)
214                    || (allowEscapesForControlCharacters
215                            && isOnlyUnicodeValidChars(literal, sUnicodeControl))
216                    || (allowNonPrintableEscapes
217                            && isOnlyUnicodeValidChars(literal, sNonPrintableChars))))
218            {
219                log(ast.getLineNo(), "forbid.escaped.unicode.char");
220            }
221        }
222    }
223
224    /**
225     * Checks if literal has Unicode chars.
226     * @param literal String literal.
227     * @return true if literal has Unicode chars.
228     */
229    private boolean hasUnicodeChar(String literal)
230    {
231        return sUnicodeRegexp.matcher(literal).find();
232    }
233
234    /**
235     * Check if String literal contains Unicode control chars.
236     * @param literal String llteral.
237     * @param pattern RegExp for valid characters.
238     * @return true, if String literal contains Unicode control chars.
239     */
240    private boolean isOnlyUnicodeValidChars(String literal, Pattern pattern)
241    {
242        final int unicodeMatchesCounter =
243                countMatches(sUnicodeRegexp, literal);
244        final int unicodeValidMatchesCouter =
245                countMatches(pattern, literal);
246        return unicodeMatchesCounter - unicodeValidMatchesCouter == 0;
247    }
248
249    /**
250     * Check if trail comment is present after ast token.
251     * @param ast current token.
252     * @return true if trail comment is present after ast token.
253     */
254    private boolean haastrailComment(DetailAST ast)
255    {
256        boolean result = false;
257        final DetailAST variableDef = getVariableDef(ast);
258        DetailAST semi;
259
260        if (variableDef != null) {
261
262            semi = variableDef.getNextSibling();
263
264            if (semi.getType() != TokenTypes.SEMI) {
265                semi = variableDef.getLastChild();
266            }
267        }
268        else {
269            semi = getSemi(ast);
270        }
271
272        if (semi != null) {
273            final int lineNo = semi.getLineNo();
274            final String currentLine = getLine(lineNo - 1);
275
276            if (currentLine != null && sCommentRegexp.matcher(currentLine).find()) {
277                result = true;
278            }
279        }
280
281        return result;
282    }
283
284    /**
285     * Count regexp matchers into String literal.
286     * @param pattern pattern.
287     * @param target String literal.
288     * @return count of regexp matchers.
289     */
290    private int countMatches(Pattern pattern, String target)
291    {
292        int matcherCounter = 0;
293        final Matcher matcher = pattern.matcher(target);
294        while (matcher.find()) {
295            matcherCounter++;
296        }
297        return matcherCounter;
298    }
299
300    /**
301     * Get variable definition.
302     * @param ast current token.
303     * @return variable definition.
304     */
305    private DetailAST getVariableDef(DetailAST ast)
306    {
307        DetailAST result = ast.getParent();
308        while (result != null
309                && result.getType() != TokenTypes.VARIABLE_DEF)
310        {
311            result = result.getParent();
312        }
313        return result;
314    }
315
316    /**
317     * Get semi token.
318     * @param ast current token.
319     * @return semi token or null.
320     */
321    private DetailAST getSemi(DetailAST ast)
322    {
323        DetailAST result = ast.getParent();
324        while (result != null
325                && result.getLastChild().getType() != TokenTypes.SEMI)
326        {
327            result = result.getParent();
328        }
329        if (result != null) {
330            result = result.getLastChild();
331        }
332        return result;
333    }
334
335    /**
336     * Checks if all characters in String literal is escaped.
337     * @param literal current literal.
338     * @return true if all characters in String literal is escaped.
339     */
340    private boolean isAllCharactersEscaped(String literal)
341    {
342        return allowIfAllCharactersEscaped
343                && sAllEscapedChars.matcher(literal.substring(1,
344                        literal.length() - 1)).find();
345    }
346}