내용

글번호 630
작성자 heojk
작성일 2017-04-17 12:46:53
제목 csv 파일로 저장
내용
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import kr.co.shineware.nlp.komoran.core.analyzer.Komoran;
import kr.co.shineware.util.common.model.Pair;
 
public class RssAnalyticsExample {
  public static void main(String[] args) {
    Komoran komoran = new Komoran("D:/Projects/komoran/models-full");
     
    URL url = null;
    InputStream is = null;
    BufferedReader br = null;
    ArrayList<Word> wordList = new ArrayList<>();
    try {
      url = new URL("http://rss.hankooki.com/daily/dh_main.xml");
      is = url.openStream();
      br = new BufferedReader(new InputStreamReader(is, "EUC-KR"));
      int readCount = -1;
      char[] buffer = new char[1000];
       
      while( (readCount=br.read(buffer))!= -1) {
        String str = new String(buffer, 0, readCount);
        str = str.replaceAll("<pubDate>.*</pubDate>", "");
        str = str.replaceAll("...CDATA.", "");
        str = str.replaceAll(">>", "");
        str = str.replaceAll("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", ""); //태그 제거
//        System.out.println(str);
        StringTokenizer tokens = new StringTokenizer(str, " \t\n\r,.·'\"-=%…()[]{}“‘▷+ⓒ");
//        System.out.println(tokens);
        while(tokens.hasMoreTokens()) {
          String token = tokens.nextToken();
           
          List<List<Pair<String,String>>> result = komoran.analyze(token);
          List<Pair<String,String>> pairs = (List<Pair<String,String>>)result.get(0);
          Pair<String, String> pair = (Pair<String, String>)pairs.get(0);
          if(pair.getSecond().equals("NNP")) {
//            System.out.println(pair.getFirst());
            token = pair.getFirst();
          }
          boolean isTokenExist = false;
          for(int i=0; i<wordList.size(); i++) {
            if(token.equals(wordList.get(i).getWord())) {
              wordList.get(i).setCount(wordList.get(i).getCount()+1);
              isTokenExist = true;
            }
          }
		List<Word> subList = wordList.subList(0, 30);
		FileOutputStream fos = new FileOutputStream("wordCount.csv");
		BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "EUC-KR"));
		
		for(Word word : subList) {
			System.out.println(word);
			bw.write(word.getWord() + "," + word.getCount() + "\n");
		}
		bw.close();
        }
      }
      Collections.sort(wordList);
      for(Word word : wordList) {
        System.out.println(word);
      }
    } catch (MalformedURLException e) {
      System.out.println("잘못된 주소입니다.");
    } catch (IOException e) {
      System.out.println("입력 스트림을 열 수 없습니다.");
    } finally {
       
    }
  }
}
class Word implements Comparable<Word> {
  private String word;
  private int count;
   
  public Word(String word) {
    super();
    this.word = word;
    this.count = 1;
  }
  public String getWord() {
    return word;
  }
  public void setWord(String word) {
    this.word = word;
  }
  public int getCount() {
    return count;
  }
  public void setCount(int count) {
    this.count = count;
  }
   
  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + ((word == null) ? 0 : word.hashCode());
    return result;
  }
  @Override
  public boolean equals(Object obj) {
    if (this == obj)
      return true;
    if (obj == null)
      return false;
    if (getClass() != obj.getClass())
      return false;
    Word other = (Word) obj;
    if (word == null) {
      if (other.word != null)
        return false;
    } else if (!word.equals(other.word))
      return false;
    return true;
  }
  @Override
  public String toString() {
    return "Word [word=" + word + ", count=" + count + "]";
  }
  @Override
  public int compareTo(Word o) {
    return -(this.count - o.getCount());
  }
}