001//////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code for adherence to a set of rules. 003// Copyright (C) 2001-2014 Oliver Burn 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018//////////////////////////////////////////////////////////////////////////////// 019package com.puppycrawl.tools.checkstyle.checks; 020 021import java.util.regex.Matcher; 022import java.util.regex.Pattern; 023 024import com.puppycrawl.tools.checkstyle.api.Check; 025import com.puppycrawl.tools.checkstyle.api.DetailAST; 026import com.puppycrawl.tools.checkstyle.api.TokenTypes; 027import com.puppycrawl.tools.checkstyle.api.Utils; 028 029/** 030 * <p> 031 * Restrict using <a href = 032 * "http://docs.oracle.com/javase/specs/jls/se7/html/jls-3.html#jls-3.3"> 033 * Unicode escapes</a> (e.g. \u221e). 034 * It is possible to allow using escapes for 035 * <a href="http://en.wiktionary.org/wiki/Appendix:Control_characters"> 036 * non-printable(control) characters</a>. 037 * Also, this check can be configured to allow using escapes 038 * if trail comment is present. By the option it is possible to 039 * allow using escapes if literal contains only them. By the option it 040 * is possible to allow using escapes for space literals. 041 * </p> 042 * <p> 043 * Examples of using Unicode:</p> 044 * <pre> 045 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment. 046 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is. 047 * </pre> 048 * <p> 049 * An example of how to configure the check is: 050 * </p> 051 * <pre> 052 * <module name="AvoidEscapedUnicodeCharacters"/> 053 * </pre> 054 * <p> 055 * An example of non-printable(control) characters. 056 * </p> 057 * <pre> 058 * return '\ufeff' + content; // byte order mark 059 * </pre> 060 * <p> 061 * An example of how to configure the check to allow using escapes 062 * for non-printable(control) characters: 063 * </p> 064 * <pre> 065 * <module name="AvoidEscapedUnicodeCharacters"> 066 * <property name="allowEscapesForControlCharacters" value="true"/> 067 * </module> 068 * </pre> 069 * <p> 070 * Example of using escapes with trail comment: 071 * </p> 072 * <pre> 073 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s" 074 * </pre> 075 * <p>An example of how to configure the check to allow using escapes 076 * if trail comment is present: 077 * </p> 078 * <pre> 079 * <module name="AvoidEscapedUnicodeCharacters"> 080 * <property name="allowByTailComment" value="true"/> 081 * </module> 082 * </pre> 083 * <p>Example of using escapes if literal contains only them: 084 * </p> 085 * <pre> 086 * String unitAbbrev = "\u03bc\u03bc\u03bc"; 087 * </pre> 088 * <p>An example of how to configure the check to allow escapes 089 * if literal contains only them: 090 * </p> 091 * <pre> 092 * <module name="AvoidEscapedUnicodeCharacters"> 093 * <property name="allowIfAllCharactersEscaped" value="true"/> 094 * </module> 095 * </pre> 096 * <p>An example of how to configure the check to allow non-printable escapes: 097 * </p> 098 * <pre> 099 * <module name="AvoidEscapedUnicodeCharacters"> 100 * <property name="allowNonPrintableEscapes" value="true"/> 101 * </module> 102 * </pre> 103 * 104 * @author maxvetrenko 105 * 106 */ 107public class AvoidEscapedUnicodeCharactersCheck 108 extends Check 109{ 110 /** Regexp for Unicode chars */ 111 private static Pattern sUnicodeRegexp = 112 Utils.getPattern("\\\\u[a-fA-F0-9]{4}"); 113 114 /** Regexp Unicode control characters */ 115 private static Pattern sUnicodeControl = Utils.getPattern("\\\\(u|U)" 116 + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)" 117 + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]" 118 + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})"); 119 120 /** Regexp for trail comment */ 121 private static Pattern sCommentRegexp = Utils.getPattern(";[ ]*//+" 122 + "[a-zA-Z0-9 ]*|;[ ]*/[*]{1}+[a-zA-Z0-9 ]*"); 123 124 /** Regexp for all escaped chars*/ 125 private static Pattern sAllEscapedChars = 126 Utils.getPattern("^((\\\\u)[a-fA-F0-9]{4}" 127 + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\\\"|\\\')+$"); 128 129 /** Regexp for non-printable unicode chars*/ 130 private static Pattern sNonPrintableChars = Utils.getPattern("\\\\u1680|\\\\u2028" 131 + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)" 132 + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)" 133 + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)" 134 + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069" 135 + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9" 136 + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604" 137 + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)" 138 + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)" 139 + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)" 140 + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00" 141 + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9" 142 + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}" 143 + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000" 144 + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)" 145 + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)" 146 + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006" 147 + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028" 148 + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025" 149 + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61"); 150 151 /** Allow use escapes for non-printable(control) characters. */ 152 private boolean allowEscapesForControlCharacters; 153 154 /** Allow use escapes if trail comment is present*/ 155 private boolean allowByTailComment; 156 157 /** Allow if all characters in literal are excaped*/ 158 private boolean allowIfAllCharactersEscaped; 159 160 /** Allow escapes for space literals*/ 161 private boolean allowNonPrintableEscapes; 162 163 /** 164 * Set allowIfAllCharactersEscaped. 165 * @param allow user's value. 166 */ 167 public final void setAllowEscapesForControlCharacters(boolean allow) 168 { 169 allowEscapesForControlCharacters = allow; 170 } 171 172 /** 173 * Set allowByTailComment. 174 * @param allow user's value. 175 */ 176 public final void setAllowByTailComment(boolean allow) 177 { 178 allowByTailComment = allow; 179 } 180 181 /** 182 * Set allowIfAllCharactersEscaped. 183 * @param allow user's value. 184 */ 185 public final void setAllowIfAllCharactersEscaped(boolean allow) 186 { 187 allowIfAllCharactersEscaped = allow; 188 } 189 190 /** 191 * Set allowSpaceEscapes. 192 * @param allow user's value. 193 */ 194 public final void setAllowNonPrintableEscapes(boolean allow) 195 { 196 allowNonPrintableEscapes = allow; 197 } 198 199 @Override 200 public int[] getDefaultTokens() 201 { 202 return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL}; 203 } 204 205 @Override 206 public void visitToken(DetailAST ast) 207 { 208 209 final String literal = ast.getText(); 210 211 if (hasUnicodeChar(literal)) { 212 if (!(allowByTailComment && haastrailComment(ast) 213 || isAllCharactersEscaped(literal) 214 || (allowEscapesForControlCharacters 215 && isOnlyUnicodeValidChars(literal, sUnicodeControl)) 216 || (allowNonPrintableEscapes 217 && isOnlyUnicodeValidChars(literal, sNonPrintableChars)))) 218 { 219 log(ast.getLineNo(), "forbid.escaped.unicode.char"); 220 } 221 } 222 } 223 224 /** 225 * Checks if literal has Unicode chars. 226 * @param literal String literal. 227 * @return true if literal has Unicode chars. 228 */ 229 private boolean hasUnicodeChar(String literal) 230 { 231 return sUnicodeRegexp.matcher(literal).find(); 232 } 233 234 /** 235 * Check if String literal contains Unicode control chars. 236 * @param literal String llteral. 237 * @param pattern RegExp for valid characters. 238 * @return true, if String literal contains Unicode control chars. 239 */ 240 private boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) 241 { 242 final int unicodeMatchesCounter = 243 countMatches(sUnicodeRegexp, literal); 244 final int unicodeValidMatchesCouter = 245 countMatches(pattern, literal); 246 return unicodeMatchesCounter - unicodeValidMatchesCouter == 0; 247 } 248 249 /** 250 * Check if trail comment is present after ast token. 251 * @param ast current token. 252 * @return true if trail comment is present after ast token. 253 */ 254 private boolean haastrailComment(DetailAST ast) 255 { 256 boolean result = false; 257 final DetailAST variableDef = getVariableDef(ast); 258 DetailAST semi; 259 260 if (variableDef != null) { 261 262 semi = variableDef.getNextSibling(); 263 264 if (semi.getType() != TokenTypes.SEMI) { 265 semi = variableDef.getLastChild(); 266 } 267 } 268 else { 269 semi = getSemi(ast); 270 } 271 272 if (semi != null) { 273 final int lineNo = semi.getLineNo(); 274 final String currentLine = getLine(lineNo - 1); 275 276 if (currentLine != null && sCommentRegexp.matcher(currentLine).find()) { 277 result = true; 278 } 279 } 280 281 return result; 282 } 283 284 /** 285 * Count regexp matchers into String literal. 286 * @param pattern pattern. 287 * @param target String literal. 288 * @return count of regexp matchers. 289 */ 290 private int countMatches(Pattern pattern, String target) 291 { 292 int matcherCounter = 0; 293 final Matcher matcher = pattern.matcher(target); 294 while (matcher.find()) { 295 matcherCounter++; 296 } 297 return matcherCounter; 298 } 299 300 /** 301 * Get variable definition. 302 * @param ast current token. 303 * @return variable definition. 304 */ 305 private DetailAST getVariableDef(DetailAST ast) 306 { 307 DetailAST result = ast.getParent(); 308 while (result != null 309 && result.getType() != TokenTypes.VARIABLE_DEF) 310 { 311 result = result.getParent(); 312 } 313 return result; 314 } 315 316 /** 317 * Get semi token. 318 * @param ast current token. 319 * @return semi token or null. 320 */ 321 private DetailAST getSemi(DetailAST ast) 322 { 323 DetailAST result = ast.getParent(); 324 while (result != null 325 && result.getLastChild().getType() != TokenTypes.SEMI) 326 { 327 result = result.getParent(); 328 } 329 if (result != null) { 330 result = result.getLastChild(); 331 } 332 return result; 333 } 334 335 /** 336 * Checks if all characters in String literal is escaped. 337 * @param literal current literal. 338 * @return true if all characters in String literal is escaped. 339 */ 340 private boolean isAllCharactersEscaped(String literal) 341 { 342 return allowIfAllCharactersEscaped 343 && sAllEscapedChars.matcher(literal.substring(1, 344 literal.length() - 1)).find(); 345 } 346}