Software Engineering for Smart Data Analytics & Smart Data Analytics for Software Engineering
TF_05
import java.io.*;
import java.util.*;
import java.util.Map.Entry;

/**
 * Term-frequency counter written as a pipeline of small functions: each
 * function takes the output of the previous one, and {@link #main} simply
 * composes them.
 *
 * <p>The program reads the text file named by the first command-line argument,
 * drops the stop words listed in {@code stop_words.txt} (looked up relative to
 * the current working directory) as well as all single-letter words, and
 * prints the 25 most frequent remaining words as {@code word - count} lines.
 */
public class TF_05 {

    /** Comma-separated stop-word file, resolved against the working directory. */
    private static final String STOP_WORDS_FILE = "stop_words.txt";

    /** Maximum number of entries that are printed. */
    private static final int TOP_COUNT = 25;

    //
    // The functions
    //

    /**
     * Reads a whole file into a string.
     *
     * @param pathToFile path of the file to read
     * @return the file contents, decoded as UTF-8
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFile(String pathToFile) throws IOException {
        File file = new File(pathToFile);
        byte[] rawData = new byte[(int) file.length()];
        int filled = 0;
        FileInputStream stream = new FileInputStream(file);
        try {
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            while (filled < rawData.length) {
                int n = stream.read(rawData, filled, rawData.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } finally {
            // Close the stream even when reading fails.
            stream.close();
        }
        // Explicit charset so the result does not depend on the platform default.
        return new String(rawData, 0, filled, "UTF-8");
    }

    /**
     * Replaces every run of non-alphanumeric characters (including '_') with a
     * single space and lower-cases the result.
     *
     * @param strData the raw text
     * @return the normalized text
     */
    private static String filterCharsAndNormalize(String strData) {
        // Fixed locale: the default-locale variant would map 'I' to a dotless
        // 'i' under e.g. a Turkish locale and corrupt the words.
        return strData.replaceAll("[\\W_]+", " ").toLowerCase(Locale.ENGLISH);
    }

    /**
     * Splits normalized text into words.
     *
     * @param strData text whose words are separated by white space
     * @return the words in order of appearance; empty if the text holds no words
     */
    private static List<String> scan(String strData) {
        // Trim first: a leading separator would otherwise produce an empty
        // "word" that ends up being counted.
        String trimmed = strData.trim();
        if (trimmed.length() == 0) {
            return new ArrayList<String>();
        }
        return Arrays.asList(trimmed.split("\\s+"));
    }

    /**
     * Returns a copy of the word list without stop words. Stop words are the
     * comma-separated entries of {@code stop_words.txt} plus every single
     * letter from 'a' to 'z'.
     *
     * @param wordList the words to filter; not modified
     * @return a new list with the remaining words in their original order
     * @throws IOException if the stop-word file cannot be read
     */
    private static List<String> removeStopWords(List<String> wordList) throws IOException {
        // A hash set gives constant-time lookups for the filter below.
        Set<String> stopWords = new HashSet<String>();
        for (String stopWord : readFile(STOP_WORDS_FILE).split(",")) {
            // Trim so that a trailing newline in the file does not hide the last entry.
            stopWords.add(stopWord.trim());
        }
        // add single-letter words
        for (char c = 'a'; c <= 'z'; c++) {
            stopWords.add(Character.toString(c));
        }

        List<String> result = new ArrayList<String>(wordList.size());
        for (String word : wordList) {
            if (!stopWords.contains(word)) {
                result.add(word);
            }
        }
        return result;
    }

    /**
     * Counts how often each word occurs.
     *
     * @param wordList the words to count
     * @return a map from word to its number of occurrences
     */
    private static Map<String, Integer> frequencies(List<String> wordList) {
        Map<String, Integer> wordFreqs = new HashMap<String, Integer>();
        for (String w : wordList) {
            Integer count = wordFreqs.get(w);
            wordFreqs.put(w, (count == null) ? 1 : count + 1);
        }
        return wordFreqs;
    }

    /**
     * Orders the entries of a frequency map by descending frequency.
     *
     * @param wordFreq map from word to number of occurrences
     * @return a new list of the map's entries, most frequent first
     */
    private static List<Entry<String, Integer>> sort(Map<String, Integer> wordFreq) {
        List<Entry<String, Integer>> result =
                new ArrayList<Entry<String, Integer>>(wordFreq.entrySet());
        Collections.sort(result, new Comparator<Entry<String, Integer>>() {
            public int compare(Entry<String, Integer> left, Entry<String, Integer> right) {
                // Operands swapped to obtain descending order.
                return right.getValue().compareTo(left.getValue());
            }
        });
        return result;
    }

    /**
     * Returns a linked list holding at most the first 25 entries of the given
     * list, in the same order.
     *
     * @param list the source list; not modified
     * @return a new linked list with up to 25 leading entries
     */
    private static LinkedList<Entry<String, Integer>> first25(List<Entry<String, Integer>> list) {
        LinkedList<Entry<String, Integer>> result = new LinkedList<Entry<String, Integer>>();
        Iterator<Entry<String, Integer>> it = list.iterator();
        for (int count = 0; it.hasNext() && (count < TOP_COUNT); count++) {
            result.add(it.next());
        }
        return result;
    }

    /**
     * Recursively prints each entry as {@code word - count} on its own line.
     *
     * @param wordFreqs the entries to print, in output order; not modified
     */
    private static void printAll(LinkedList<Entry<String, Integer>> wordFreqs) {
        if (wordFreqs.isEmpty()) {
            return;
        }
        // Simulating a non destructive tail operation as in "word_freqs[1:]"
        LinkedList<Entry<String, Integer>> tail = new LinkedList<Entry<String, Integer>>(wordFreqs);
        Entry<String, Integer> first = tail.removeFirst();
        System.out.println(first.getKey() + " - " + first.getValue());
        printAll(tail);
    }

    //
    // The main function
    //

    /**
     * Runs the pipeline on the file named by the first argument.
     *
     * @param args {@code args[0]} is the path of the text file to analyze
     * @throws IOException if the input file or the stop-word file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: java TF_05 <path-to-text-file>");
            System.exit(1);
        }
        printAll(first25(sort(frequencies(removeStopWords(scan(filterCharsAndNormalize(readFile(args[0]))))))));
    }
}