import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
public class Article {
//保存文章的內容
String content;
//保存分割後的單詞集合
String[] rawWords;
//保存統計後的單詞集合
String[] words;
//保存單詞對應的詞頻
int[] wordFreqs;
//構造函數,輸入文章內容
//提高部分:從文件中讀取
public Article() {
content = "kolya is one of the richest films i've seen in some time . zdenek sverak plays a confirmed old bachelor ( who's likely to remain so ) , who finds his life as a czech cellist increasingly impacted by the five-year old boy that he's taking care of . though it ends rather abruptly-- and i'm whining , 'cause i wanted to spend more time with these characters-- the acting , writing , and production values are as high as , if not higher than , comparable american dramas . this father-and-son delight-- sverak also wrote the script , while his son , jan , directed-- won a golden globe for best foreign language film and , a couple days after i saw it , walked away an oscar . in czech and russian , with english subtitles . ";
}
//對文章根據分隔符進行分詞,將結果保存到rawWords數組中
public void splitWord(){
//分詞的時候,因為標點符號不參與,所以所有的符號全部替換為空格
final char SPACE = ' ';
content = content.replace('\'', SPACE).replace(',', SPACE).replace('.', SPACE);
content = content.replace('(', SPACE).replace(')', SPACE).replace('-', SPACE);
rawWords = content.split("\\s+");//凡是空格隔開的都算單詞,上面替換了', 所以I've 被分成2個 //單詞
}
//統計詞,遍歷數組
public void countWordFreq() {
//將所有出現的字符串放入唯壹的set中,不用map,是因為map尋找效率太低了
Set<String> set = new TreeSet<String>();
for(String word: rawWords){
set.add(word);
}
Iterator ite = set.iterator();
List<String> wordsList = new ArrayList<String>();
List<Integer> freqList = new ArrayList<Integer>();
//多少個字符串未知,所以用list來保存先
while(ite.hasNext()){
String word = (String) ite.next();
int count = 0;//統計相同字符串的個數
for(String str: rawWords){
if(str.equals(word)){
count++;
}
}
wordsList.add(word);
freqList.add(count++);
}
//存入數組當中
words = wordsList.toArray(new String[0]);
wordFreqs = new int[freqList.size()];
for(int i = 0; i < freqList.size(); i++){
wordFreqs[i] = freqList.get(i);
}
}
//根據詞頻,將詞數組和詞頻數組進行降序排序
public void sort() {
class Word{
private String word;
private int freq;
public Word(String word, int freq){
this.word = word;
this.freq = freq;
}
}
//註意:此處排序,1)首先按照詞頻降序排列, 2)如果詞頻相同,按照字母降序排列,
//如 'abc' > 'ab' >'aa'
class WordComparator implements Comparator{
public int compare(Object o1, Object o2) {
Word word1 = (Word) o1;
Word word2 = (Word) o2;
if(word1.freq < word2.freq){
return 1;
}else if(word1.freq > word2.freq){
return -1;
}else{
int len1 = word1.word.trim().length();
int len2 = word2.word.trim().length();
String min = len1 > len2? word2.word: word1.word;
String max = len1 > len2? word1.word: word2.word;
for(int i = 0; i < min.length(); i++){
if(min.charAt(i) < max.charAt(i)){
return 1;
}
}
return 1;
}
}
}
List wordList = new ArrayList<Word>();
for(int i = 0; i < words.length; i++){
wordList.add(new Word(words[i], wordFreqs[i]));
}
Collections.sort(wordList, new WordComparator());
for(int i = 0; i < wordList.size(); i++){
Word wor = (Word) wordList.get(i);
words[i] = wor.word;
wordFreqs[i] = wor.freq;
}
}
//將排序結果輸出
public void printResult() {
System.out.println("Total " + words.length + " different words in the content!");
for(int i = 0; i < words.length; i++){
System.out.println(wordFreqs[i] + " " + words[i]);
}
}
//測試類的功能
public static void main(String[] args) {
Article a = new Article();
a.splitWord();
a.countWordFreq();
a.sort();
a.printResult();
}
}
-----------------------
Total 99 different words in the content!
5 and
4 the
4 i
4 a
3 as
2 with
2 who
2 to
2 time
2 sverak
2 son
2 s
2 old
2 of
2 it
2 in
2 his
2 czech
1 zdenek
1 year
1 wrote
1 writing
1 won
1 whining
1 while
1 wanted
1 walked
1 ve
1 values
1 though
1 this
1 these
1 that
1 than
1 taking
1 subtitles
1 spend
1 some
1 so
1 seen
1 script
1 saw
1 russian
1 richest
1 remain
1 rather
1 production
1 plays
1 oscar
1 one
1 not
1 more
1 m
1 likely
1 life
1 language
1 kolya
1 jan
1 is
1 increasingly
1 impacted
1 if
1 higher
1 high
1 he
1 golden
1 globe
1 foreign
1 for
1 five
1 finds
1 films
1 film
1 father
1 english
1 ends
1 dramas
1 directed
1 delight
1 days
1 couple
1 confirmed
1 comparable
1 characters
1 cellist
1 cause
1 care
1 by
1 boy
1 best
1 bachelor
1 away
1 are
1 an
1 american
1 also
1 after
1 acting
1 abruptly