Cleaning Data in Java
Dennis Lee
Software Engineer
d-MMM-yy
// Raw date strings extracted from our dataset; the last entry uses slashes instead of d-MMM-yy
List<String> dates = Arrays.asList(
    "4-Jan-22",
    "1-Aug-22",
    "7-Jul-22",
    "27-Apr-22",
    "24/02/22");
d-MMM-yy
Regex for the format d-MMM-yy (like "4-Jan-22"):
\\d{1,2}-[A-Za-z]{3}-\\d{2}
\\d{1,2}     // 1-2 digits for day (like 4 or 27)
-            // literal hyphen separator
[A-Za-z]{3}  // exactly 3 letters for month like Jan; uppercase or lowercase
-            // literal hyphen separator
\\d{2}       // exactly 2 digits for year (like 22)
import java.util.regex.Pattern;
// Match d-MMM-yy Pattern datePattern = Pattern.compile("\\d{1,2}-[A-Za-z]{3}-\\d{2}");
for (String date : dates) { // Check if date matches datePattern boolean isValid = datePattern.matcher(date).matches(); System.out.println(date + " is valid: " + isValid); }
4-Jan-22 is valid: true
1-Aug-22 is valid: true
7-Jul-22 is valid: true
27-Apr-22 is valid: true
24/02/22 is valid: false
import java.util.regex.Matcher;
// 3 consecutive letters like "Jan"
Pattern monthPattern = Pattern.compile("[A-Za-z]{3}");

for (String date : dates) {
    // Is monthPattern anywhere in date? find() searches for a matching substring,
    // unlike matches(), which requires the whole string to fit the pattern.
    Matcher matcher = monthPattern.matcher(date);
    if (matcher.find()) { // Returns true if the pattern is found
        // matcher.group() returns the matched content
        System.out.println("Month found: " + matcher.group());
    } else {
        System.out.println("No month found in: " + date);
    }
}
Month found: Jan
Month found: Aug
Month found: Jul
Month found: Apr
No month found in: 24/02/22
import com.google.common.base.CharMatcher;
// For each date, report whether it contains at least one digit, one letter and one hyphen.
// Note: matchesAnyOf() only checks that such a character is PRESENT; it does not verify
// that every character in the string belongs to one of these classes.
for (String date : dates) {
    boolean isValid =
        // Check for digits in date
        CharMatcher.inRange('0', '9').matchesAnyOf(date) &&
        // Check for letters (uppercase and lowercase) in date
        CharMatcher.inRange('a', 'z')
            .or(CharMatcher.inRange('A', 'Z')).matchesAnyOf(date) &&
        // Check for hyphens in date
        CharMatcher.is('-').matchesAnyOf(date);
    System.out.println(date + " has valid characters: " + isValid);
}
4-Jan-22 has valid characters: true
1-Aug-22 has valid characters: true
7-Jul-22 has valid characters: true
27-Apr-22 has valid characters: true
24/02/22 has valid characters: false
// Define pattern d-MMM-yy, like 4-Jan-22
Pattern datePattern = Pattern.compile("\\d{1,2}-[A-Za-z]{3}-\\d{2}");
// Full pattern matching (does the whole date match datePattern?)
boolean matchesPattern = datePattern.matcher(date).matches();
// Pattern finding (does date contain a three-letter month substring?)
boolean hasMonth = Pattern.compile("[A-Za-z]{3}").matcher(date).find();
// Character matching (does date contain at least one ASCII digit?)
// Uses inRange('0', '9') because CharMatcher.digit() is deprecated in Guava.
boolean hasDigits = CharMatcher.inRange('0', '9').matchesAnyOf(date);
Cleaning Data in Java