1. 형태소 분석기 라이브러리 다운로드(다운로드 후 압축을 푸세요)
형태소 분석기 라이브러리
2. 형태소 분석기 모델 다운로드
형태소 분석기 모델
3. 1번 라이브러리파일을 압축 풀고 프로젝트 lib 폴더에 복사(lib 폴더를 먼저 만드세요)
4. 빌드패스 설정
프로젝트를 선택하고 마우스 오른쪽 버튼 -> Build Path -> Configure Build Path -> Add JARs...
5. 소스코드에서 아래 Komoran 생성자의 경로를 2번 모델의 압축을 푼 디렉토리로 수정
Komoran komoran = new Komoran("D:/Projects/komoran/models-full");
6. 소스코드
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import kr.co.shineware.nlp.komoran.core.analyzer.Komoran;
import kr.co.shineware.util.common.model.Pair;
/**
 * Downloads an RSS feed, strips its markup, and prints how often each token occurs.
 *
 * <p>The feed text is split on whitespace and punctuation. Every token is passed
 * through the Komoran analyzer; when the first morpheme of the analysis is tagged
 * {@code NNP}, that morpheme replaces the raw token before counting. The resulting
 * {@link Word} entries are sorted by their natural ordering and printed one per line.
 */
public class RssAnalyticsExample {

    /** Directory holding the Komoran model files; adjust to the local installation. */
    private static final String MODEL_PATH = "D:/Projects/komoran/models-full";

    /** Address of the RSS feed to analyze. */
    private static final String FEED_URL = "http://rss.hankooki.com/daily/dh_main.xml";

    /** Character encoding used to decode the feed. */
    private static final String FEED_CHARSET = "EUC-KR";

    /** Characters that separate tokens in the cleaned feed text. */
    private static final String TOKEN_DELIMITERS = " \t\n\r,.'\"-=%…()[]{}“▷+ⓒ";

    /** Matches an opening, closing, or self-closing markup tag. */
    private static final String TAG_REGEX =
            "<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>";

    /**
     * Program entry point: fetches the feed, counts its tokens and prints the counts.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Komoran komoran = new Komoran(MODEL_PATH);
        try {
            String text = stripMarkup(readFeed(FEED_URL));
            List<Word> wordList = countWords(komoran, text);
            Collections.sort(wordList);
            for (Word word : wordList) {
                System.out.println(word);
            }
        } catch (MalformedURLException e) {
            System.out.println("잘못된 주소입니다.");
        } catch (IOException e) {
            System.out.println("입력 스트림을 열 수 없습니다.");
        }
    }

    /**
     * Reads the complete feed into memory.
     *
     * <p>The whole document is collected before any cleaning so that tags and words
     * lying across a read-buffer boundary are not cut in half. The stream and reader
     * are closed by try-with-resources even when reading fails.
     *
     * @param address URL of the feed
     * @return the decoded feed content
     * @throws IOException if the address is malformed or the feed cannot be read
     */
    private static String readFeed(String address) throws IOException {
        URL url = new URL(address);
        StringBuilder content = new StringBuilder();
        try (InputStream is = url.openStream();
                BufferedReader br = new BufferedReader(new InputStreamReader(is, FEED_CHARSET))) {
            char[] buffer = new char[1000];
            int readCount;
            while ((readCount = br.read(buffer)) != -1) {
                content.append(buffer, 0, readCount);
            }
        }
        return content.toString();
    }

    /**
     * Removes publication dates, CDATA markers and markup tags from the feed text.
     *
     * @param feed raw feed content
     * @return the feed content with markup removed
     */
    private static String stripMarkup(String feed) {
        // Reluctant match so that several <pubDate> elements on one line are removed
        // individually instead of swallowing everything between the first and the last.
        String str = feed.replaceAll("<pubDate>.*?</pubDate>", "");
        // Literal replacement: only real CDATA markers are dropped.
        str = str.replace("<![CDATA[", "");
        str = str.replace("]]>", "");
        // Remove tags.
        return str.replaceAll(TAG_REGEX, "");
    }

    /**
     * Tokenizes the text and counts the occurrences of each normalized token.
     *
     * @param komoran analyzer used to normalize tokens
     * @param text cleaned feed text
     * @return one {@link Word} per distinct token, in order of first appearance
     */
    private static List<Word> countWords(Komoran komoran, String text) {
        // Insertion-ordered map: constant-time lookup while keeping first-seen order.
        Map<String, Word> counts = new LinkedHashMap<String, Word>();
        StringTokenizer tokens = new StringTokenizer(text, TOKEN_DELIMITERS);
        while (tokens.hasMoreTokens()) {
            String token = normalize(komoran, tokens.nextToken());
            Word seen = counts.get(token);
            if (seen == null) {
                counts.put(token, new Word(token));
            } else {
                seen.setCount(seen.getCount() + 1);
            }
        }
        return new ArrayList<Word>(counts.values());
    }

    /**
     * Returns the first morpheme of the token when it is tagged {@code NNP}
     * (presumably the proper-noun tag — confirm against the Komoran tag set),
     * otherwise the token itself.
     *
     * @param komoran analyzer to run on the token
     * @param token raw token
     * @return the token to count
     */
    private static String normalize(Komoran komoran, String token) {
        List<List<Pair<String, String>>> result = komoran.analyze(token);
        // Guard against an empty analysis instead of failing on get(0).
        if (result == null || result.isEmpty()) {
            return token;
        }
        List<Pair<String, String>> pairs = result.get(0);
        if (pairs == null || pairs.isEmpty()) {
            return token;
        }
        Pair<String, String> pair = pairs.get(0);
        if ("NNP".equals(pair.getSecond())) {
            return pair.getFirst();
        }
        return token;
    }
}
class Word implements Comparable<Word> {
private String word;
private int count;
public Word(String word) {
super();
this.word = word;
this.count = 1;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((word == null) ? 0 : word.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Word other = (Word) obj;
if (word == null) {
if (other.word != null)
return false;
} else if (!word.equals(other.word))
return false;
return true;
}
@Override
public String toString() {
return "Word [word=" + word + ", count=" + count + "]";
}
@Override
public int compareTo(Word o) {
return this.count - o.getCount();
}
}
|