SDA SE Wiki

Software Engineering for Smart Data Analytics & Smart Data Analytics for Software Engineering

User Tools

Site Tools


Pipeline Style

  • This version tries to be as close as possible to the Python version.

TF_05

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

/**
 * Term-frequency program written in "pipeline style": every step is a pure
 * function that takes the output of the previous step, and {@link #main}
 * simply composes them.
 *
 * The program reads the file named by the first command line argument,
 * counts how often each word occurs (ignoring stop words listed in
 * {@code stop_words.txt} and single-letter words) and prints the 25 most
 * frequent words together with their counts.
 */
public class TF_05 {

	/** Name of the comma-separated stop word file, resolved against the working directory. */
	private static final String STOP_WORDS_FILE = "stop_words.txt";

	/** Number of entries that {@link #first25} keeps. */
	private static final int TOP_COUNT = 25;

	//
	// The functions
	//

	/**
	 * Takes a path to a file and returns the entire contents of the file as a
	 * string. The bytes are decoded as UTF-8 so that the result does not depend
	 * on the platform default charset.
	 *
	 * Files whose length does not fit into an {@code int} are not supported.
	 *
	 * @param pathToFile path of the file to read
	 * @return the decoded contents of the file
	 * @throws IOException if the file cannot be opened or read
	 */
	private static String readFile(String pathToFile) throws IOException {
		File file = new File(pathToFile);
		byte[] rawData = new byte[(int) file.length()];
		int filled = 0;
		FileInputStream stream = new FileInputStream(file);
		try {
			// A single read() may deliver fewer bytes than requested, so keep
			// reading until the buffer is full or the stream ends.
			while (filled < rawData.length) {
				int n = stream.read(rawData, filled, rawData.length - filled);
				if (n < 0) {
					break;
				}
				filled += n;
			}
		} finally {
			// Close the stream even when reading fails.
			stream.close();
		}
		return new String(rawData, 0, filled, "UTF-8");
	}

	/**
	 * Takes a string and returns a lower-cased copy in which every run of
	 * non-alphanumeric characters (including underscores) is replaced by a
	 * single space.
	 *
	 * @param strData the raw text
	 * @return the normalized text
	 */
	private static String filterCharsAndNormalize(String strData) {
		// Locale.ROOT keeps the lower-casing independent of the default locale.
		return strData.replaceAll("[\\W_]+", " ").toLowerCase(Locale.ROOT);
	}

	/**
	 * Takes a string and scans for words, returning a list of words. Words are
	 * separated by single spaces; empty tokens are dropped.
	 *
	 * @param strData normalized text as produced by {@link #filterCharsAndNormalize}
	 * @return the words in order of appearance
	 */
	private static List<String> scan(String strData) {
		List<String> words = new ArrayList<String>();
		for (String token : strData.split(" ")) {
			// split(" ") yields an empty first token when the text starts with
			// a space; that is not a word and must not be counted.
			if (token.length() > 0) {
				words.add(token);
			}
		}
		return words;
	}

	/**
	 * Takes a list of words and returns a copy with all stop words removed.
	 * Stop words are read from {@code stop_words.txt} (comma separated);
	 * all single-letter words a-z are treated as stop words as well.
	 *
	 * @param wordList the words to filter; not modified
	 * @return a new list without stop words, in the original order
	 * @throws IOException if the stop word file cannot be read
	 */
	private static List<String> removeStopWords(List<String> wordList) throws IOException {
		// A hash set gives constant-time lookups while filtering.
		Set<String> stopWords = new HashSet<String>();
		for (String stopWord : readFile(STOP_WORDS_FILE).split(",")) {
			// Trim so that a trailing newline or padding around the commas
			// does not corrupt an entry.
			String trimmed = stopWord.trim();
			if (trimmed.length() > 0) {
				stopWords.add(trimmed);
			}
		}
		// add single-letter words
		for (char c = 'a'; c <= 'z'; c++) {
			stopWords.add(Character.toString(c));
		}
		List<String> result = new ArrayList<String>(wordList.size());
		for (String word : wordList) {
			if (!stopWords.contains(word)) {
				result.add(word);
			}
		}
		return result;
	}

	/**
	 * Takes a list of words and returns a dictionary associating words with
	 * frequencies of occurrence.
	 *
	 * @param wordList the words to count
	 * @return a map from word to number of occurrences
	 */
	private static Map<String, Integer> frequencies(List<String> wordList) {
		Map<String, Integer> wordFreqs = new HashMap<String, Integer>();
		for (String w : wordList) {
			Integer count = wordFreqs.get(w);
			wordFreqs.put(w, (count == null) ? 1 : count + 1);
		}
		return wordFreqs;
	}

	/**
	 * Takes a dictionary of words and their frequencies and returns a list of
	 * pairs where the entries are sorted by descending frequency.
	 *
	 * @param wordFreq map from word to number of occurrences
	 * @return the map entries, most frequent first
	 */
	private static List<Entry<String, Integer>> sort(Map<String, Integer> wordFreq) {
		List<Entry<String, Integer>> result = new ArrayList<Entry<String, Integer>>(wordFreq.entrySet());
		Comparator<Entry<String, Integer>> byFrequencyDescending = new Comparator<Entry<String, Integer>>() {
			@Override
			public int compare(Entry<String, Integer> left, Entry<String, Integer> right) {
				// Swapping the operands gives descending order without
				// negating a comparison result.
				return right.getValue().compareTo(left.getValue());
			}
		};
		Collections.sort(result, byFrequencyDescending);
		return result;
	}

	/**
	 * Takes a list and returns a linked list containing the first 25 entries
	 * of the original list, or all of them if the list is shorter.
	 *
	 * @param list the sorted entries
	 * @return a new linked list with at most 25 leading entries
	 */
	private static LinkedList<Entry<String, Integer>> first25(List<Entry<String, Integer>> list) {
		LinkedList<Entry<String, Integer>> result = new LinkedList<Entry<String, Integer>>();
		Iterator<Entry<String, Integer>> it = list.iterator();
		for (int count = 0; it.hasNext() && (count < TOP_COUNT); count++) {
			result.add(it.next());
		}
		return result;
	}

	/**
	 * Takes a list of pairs where the entries are sorted by frequency and
	 * prints them recursively, one {@code word - count} line per entry.
	 * The list passed in is not modified.
	 *
	 * @param wordFreqs the entries to print
	 */
	private static void printAll(LinkedList<Entry<String, Integer>> wordFreqs) {
		if (!wordFreqs.isEmpty()) {
			// Simulating a non destructive tail operation as in "word_freqs[1:]"
			LinkedList<Entry<String, Integer>> tail = new LinkedList<Entry<String, Integer>>(wordFreqs);
			Entry<String, Integer> first = tail.removeFirst();
			System.out.println("" + first.getKey() + " - " + first.getValue());
			printAll(tail);
		}
	}

	//
	// The main function
	//

	/**
	 * Runs the whole pipeline on the file named by the first argument.
	 *
	 * @param args command line arguments; {@code args[0]} is the input file
	 * @throws IOException if the input file or the stop word file cannot be read
	 * @throws IllegalArgumentException if no input file is given
	 */
	public static void main(String[] args) throws IOException {
		if (args.length < 1) {
			throw new IllegalArgumentException("Usage: java TF_05 <path-to-text-file>");
		}
		printAll(first25(sort(frequencies(removeStopWords(scan(filterCharsAndNormalize(readFile(args[0]))))))));
	}

}
teaching/seminars/style/2014/pipeline_style.txt · Last modified: 2018/05/24 15:09 by Daniel Speicher

SEWiki, © 2021