SDA SE Wiki

Software Engineering for Smart Data Analytics & Smart Data Analytics for Software Engineering

User Tools

Site Tools


Term Frequency

package plain;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.Set;

/**
 * Command-line tool that prints the most frequent words of a text file.
 *
 * <p>Words are lower-cased, stop words are dropped, and the remaining words are
 * printed as {@code word - count}, most frequent first.
 */
public class Main {

  /** Stop word list; resolved relative to the current working directory. */
  private static final String STOP_WORDS_FILE = "stop_words.txt";

  /** Any run of characters other than ASCII letters and digits separates two words. */
  private static final String WORD_DELIMITER = "[\\W_]+";

  /** Upper bound on the number of words printed. */
  private static final int MAX_PRINTED_TERMS = 25;

  /**
   * Counts the non-stop words of the given file and prints the most frequent
   * ones to standard output, one {@code word - count} pair per line.
   *
   * @param pathToFile path of the text file to analyse
   * @throws IOException if the stop word file or the text file cannot be read
   */
  private static void printTermFrequencies(String pathToFile) throws IOException {
    Set<String> stopWords = loadStopWords();
    Map<String, Integer> wordFrequencies = countWords(pathToFile, stopWords);
    List<Entry<String, Integer>> orderedFrequencies = sortByFrequencyDescending(wordFrequencies);

    int limit = Math.min(MAX_PRINTED_TERMS, orderedFrequencies.size());
    for (Entry<String, Integer> wordFrequency : orderedFrequencies.subList(0, limit)) {
      System.out.println(wordFrequency.getKey() + " - " + wordFrequency.getValue());
    }
  }

  /**
   * Reads the comma-separated stop words from the first line of
   * {@link #STOP_WORDS_FILE} and adds every single letter {@code a}..{@code z}.
   *
   * @return the set of words to exclude from counting
   * @throws IOException if the stop word file cannot be opened or read
   */
  private static Set<String> loadStopWords() throws IOException {
    Set<String> stopWords = new HashSet<String>();

    BufferedReader reader = new BufferedReader(new FileReader(STOP_WORDS_FILE));
    try {
      // readLine() returns null for an empty file; treat that as "no stop words".
      String line = reader.readLine();
      if (line != null) {
        for (String stopWord : line.split(",")) {
          // Counted words never contain whitespace, so an untrimmed entry
          // such as " and" could never match anything.
          stopWords.add(stopWord.trim());
        }
      }
    } finally {
      reader.close();
    }

    // Single characters are treated as stop words as well.
    for (char c = 'a'; c <= 'z'; c++) {
      stopWords.add(Character.toString(c));
    }
    return stopWords;
  }

  /**
   * Counts how often each lower-cased word that is not a stop word occurs in the file.
   *
   * @param pathToFile path of the text file to analyse
   * @param stopWords words to skip
   * @return a map from word to its number of occurrences
   * @throws IOException if the file cannot be opened or reading it fails
   */
  private static Map<String, Integer> countWords(String pathToFile, Set<String> stopWords)
      throws IOException {
    Map<String, Integer> wordFrequencies = new HashMap<String, Integer>();

    Scanner scanner = new Scanner(new File(pathToFile));
    try {
      scanner.useDelimiter(WORD_DELIMITER);
      while (scanner.hasNext()) {
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String word = scanner.next().toLowerCase(Locale.ROOT);
        if (!stopWords.contains(word)) {
          Integer count = wordFrequencies.get(word);
          wordFrequencies.put(word, (count == null) ? 1 : count + 1);
        }
      }
      // Scanner swallows read errors and simply reports "no more input";
      // surface them instead of returning silently truncated counts.
      IOException readFailure = scanner.ioException();
      if (readFailure != null) {
        throw readFailure;
      }
    } finally {
      scanner.close();
    }
    return wordFrequencies;
  }

  /**
   * Returns the entries of the given map ordered by count, highest first.
   * The sort is stable, so words with equal counts keep the map's iteration order.
   *
   * @param wordFrequencies map from word to its number of occurrences
   * @return a new list holding the map's entries in descending order of count
   */
  private static List<Entry<String, Integer>> sortByFrequencyDescending(
      Map<String, Integer> wordFrequencies) {
    List<Entry<String, Integer>> orderedFrequencies =
        new ArrayList<Entry<String, Integer>>(wordFrequencies.entrySet());
    Collections.sort(orderedFrequencies, new Comparator<Entry<String, Integer>>() {
      @Override
      public int compare(Entry<String, Integer> left, Entry<String, Integer> right) {
        // Operands swapped (instead of negating the result) for descending order.
        return right.getValue().compareTo(left.getValue());
      }
    });
    return orderedFrequencies;
  }

  /**
   * Entry point. Expects the path of the text file to analyse as the first argument.
   * Problems are reported on standard error so they do not mix with the result listing.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the text file
   */
  public static void main(String[] args) {
    if (args.length < 1) {
      System.err.println("Usage: java plain.Main <path-to-text-file>");
      return;
    }
    try {
      printTermFrequencies(args[0]);
    } catch (IOException e) {
      System.err.println(e);
    }
  }

}


teaching/seminars/style/2014/plain.txt · Last modified: 2018/05/09 01:59 (external edit)

SEWiki, © 2021