Software Engineering for Smart Data Analytics & Smart Data Analytics for Software Engineering
package plain;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.Set;

/**
 * Command-line term-frequency counter.
 *
 * <p>Reads a text file, splits it into lower-cased words, discards stop words
 * and single letters, and prints the most frequent remaining words together
 * with their counts, one {@code word - count} pair per line.
 */
public class Main {

    /** Stop-word file; a relative path, so it is looked up in the working directory. */
    private static final String STOP_WORDS_FILE = "stop_words.txt";

    /** Upper bound on the number of words printed. */
    private static final int MAX_PRINTED_WORDS = 25;

    /** Token delimiter: any run of non-word characters and/or underscores. */
    private static final String WORD_DELIMITER = "[\\W_]+";

    /** Orders entries by their count, highest count first. */
    private static final Comparator<Entry<String, Integer>> BY_FREQUENCY_DESCENDING =
            new Comparator<Entry<String, Integer>>() {
                public int compare(Entry<String, Integer> left, Entry<String, Integer> right) {
                    // Operands are swapped (instead of negating the result) to get descending order.
                    return right.getValue().compareTo(left.getValue());
                }
            };

    /**
     * Prints the most frequent non-stop words of a text file to standard output.
     *
     * @param pathToFile path of the text file to analyse
     * @throws IOException if the stop-word file or the text file cannot be opened or read
     */
    private static void printTermFrequencies(String pathToFile) throws IOException {
        Set<String> stopWords = loadStopWords();
        Map<String, Integer> wordFrequencies = countWords(pathToFile, stopWords);

        // Sort the words and their frequency by frequency in descending order
        List<Entry<String, Integer>> orderedFrequencies =
                new ArrayList<Entry<String, Integer>>(wordFrequencies.entrySet());
        Collections.sort(orderedFrequencies, BY_FREQUENCY_DESCENDING);

        // Print the first (most frequent) words with their frequency
        Iterator<Entry<String, Integer>> it = orderedFrequencies.iterator();
        for (int printed = 0; it.hasNext() && (printed < MAX_PRINTED_WORDS); printed++) {
            Entry<String, Integer> wordFrequency = it.next();
            System.out.println("" + wordFrequency.getKey() + " - " + wordFrequency.getValue());
        }
    }

    /**
     * Builds the stop-word set from the first line of {@link #STOP_WORDS_FILE}
     * (comma-separated) plus every single letter from {@code a} to {@code z}.
     *
     * @return the set of words to ignore while counting
     * @throws IOException if the stop-word file cannot be opened or read
     */
    private static Set<String> loadStopWords() throws IOException {
        Set<String> stopWords = new HashSet<String>();

        BufferedReader reader = new BufferedReader(new FileReader(STOP_WORDS_FILE));
        try {
            String line = reader.readLine();
            // readLine() yields null for an empty file; treat that as "no stop words".
            if (line != null) {
                for (String stopWord : line.split(",")) {
                    // Tokens never contain whitespace, so an untrimmed entry could never match.
                    String trimmed = stopWord.trim();
                    if (trimmed.length() > 0) {
                        stopWords.add(trimmed);
                    }
                }
            }
        } finally {
            // Release the file handle even when reading fails.
            reader.close();
        }

        // Add single characters as stop words
        for (char c = 'a'; c <= 'z'; c++) {
            stopWords.add(Character.toString(c));
        }
        return stopWords;
    }

    /**
     * Counts how often each lower-cased word that is not a stop word occurs in a file.
     *
     * @param pathToFile path of the text file to analyse
     * @param stopWords words that must not be counted
     * @return map from word to its number of occurrences
     * @throws IOException if the file cannot be opened or a read error occurs
     */
    private static Map<String, Integer> countWords(String pathToFile, Set<String> stopWords)
            throws IOException {
        Map<String, Integer> wordFrequencies = new HashMap<String, Integer>();

        Scanner scanner = new Scanner(new File(pathToFile));
        try {
            scanner.useDelimiter(WORD_DELIMITER);
            while (scanner.hasNext()) {
                // Locale.ROOT keeps lower-casing independent of the platform's default locale.
                String word = scanner.next().toLowerCase(Locale.ROOT);
                if (!stopWords.contains(word)) {
                    Integer count = wordFrequencies.get(word);
                    wordFrequencies.put(word, (count == null) ? 1 : count + 1);
                }
            }

            // Scanner suppresses read errors and simply stops; surface them to the caller.
            IOException readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        } finally {
            scanner.close();
        }
        return wordFrequencies;
    }

    /**
     * Program entry point.
     *
     * @param args expects the path of the text file to analyse as the first element
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java plain.Main <path-to-text-file>");
            return;
        }
        try {
            printTermFrequencies(args[0]);
        } catch (IOException e) {
            System.out.println(e);
        }
    }
}