Home » 哈工大分词java接口

哈工大分词java接口

package IRdll;

import java.io.File;
import java.io.Reader;
import java.io.FileInputStream;
import java.io.*;
import java.util.Date;
import java.util.HashSet;
/**
 * Title: Java Chinese word segmentation interface.
 *
 * <p>Description: this component is built on top of the Harbin Institute of
 * Technology (HIT) word segmentation system. It is provided for study and
 * research only; anyone who uses it commercially bears the legal consequences
 * themselves, and the author of the component is not liable.
 *
 * <p>The actual segmentation is done by native code in the {@code IRdll}
 * library, which is loaded when this class is initialised. The class is a
 * lazily created singleton; obtain it through {@link #getInstance()}.
 *
 * <p>Copyright: Copyright (c) 2006
 *
 * <p>Company: Dalian University of Technology
 *
 * @author yezheng
 * @version 1.0
 */
public class IRSplit {
    /**
     * Line terminator written after every segmented line. The published
     * listing showed the characters 'r' and 'n' here because the backslashes
     * of the escape sequences had been stripped.
     */
    private static final String LINE_END = "\r\n";

    /** The single shared instance; reset to null by {@link #ReleaseSeggers()}. */
    private static IRSplit instance = null;

    static {
        // The native methods below are implemented in this library.
        System.loadLibrary("IRdll");
    }

    /** Private constructor: loads the segmenter resources through the native library. */
    private IRSplit() {
        System.out.println("正在加载词典……");
        this.LoadSegRes();
        System.out.println("加载结束");
    }

    /**
     * Returns the shared instance, creating it on first use.
     * No synchronisation is performed here.
     *
     * @return the shared {@code IRSplit} instance
     */
    public static IRSplit getInstance() {
        if (instance == null) {
            instance = new IRSplit();
        }
        return instance;
    }

    // Native methods implemented in the IRdll library.
    private native void LoadSegRes();
    private native void ReleaseSegger();
    private native String split(String sentence);

    /**
     * Segments a single sentence.
     *
     * @param sentence the text to segment; may be null
     * @return the native segmenter's output, or "" for null or empty input
     */
    public String splitSentence(String sentence) {
        // The null test must come first; the other order throws before the guard runs.
        if (sentence == null || sentence.length() < 1) {
            return "";
        }
        return split(sentence);
    }

    /**
     * Segments a long piece of text by cutting it after every character whose
     * Unicode category is {@link Character#OTHER_PUNCTUATION}, segmenting each
     * piece separately and concatenating the results in order.
     *
     * @param sentence the text to segment; may be null
     * @return the concatenated segmenter output, or "" for null or empty input
     */
    public String splitLongSentence(String sentence) {
        if (sentence == null || sentence.length() < 1) {
            return "";
        }
        // A local buffer keeps calls independent of each other.
        StringBuffer result = new StringBuffer(sentence.length() * 2);
        int start = 0;
        for (int i = 0; i < sentence.length(); i++) {
            if (Character.getType(sentence.charAt(i)) == Character.OTHER_PUNCTUATION) {
                int end = i + 1; // the punctuation mark stays with the piece it ends
                result.append(split(sentence.substring(start, end)));
                start = end;
            }
        }
        if (start < sentence.length()) {
            // Trailing text that is not terminated by punctuation.
            result.append(split(sentence.substring(start)));
        }
        return result.toString();
    }

    /**
     * Clears the shared instance and releases the native segmenter.
     * A later {@link #getInstance()} call creates a fresh instance.
     */
    public void ReleaseSeggers() {
        instance = null;
        ReleaseSegger();
    }

    /**
     * Segments a file line by line and writes the result to {@code outfile}.
     * Every input line, including empty ones, produces one CRLF-terminated
     * output line. Files are read and written with the platform default
     * charset. I/O failures are reported on standard output and not rethrown.
     *
     * @param file    the file to read
     * @param outfile the file to write; overwritten if it exists
     */
    public void splitFile(File file, File outfile) {
        try {
            StringBuffer segmented = new StringBuffer();
            BufferedReader br =
                    new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    // splitLongSentence yields "" for an empty line.
                    segmented.append(splitLongSentence(line)).append(LINE_END);
                }
            } finally {
                br.close(); // also closes the underlying file stream
            }
            FileWriter writer = new FileWriter(outfile);
            try {
                writer.write(segmented.toString());
            } finally {
                writer.close();
            }
        } catch (FileNotFoundException ex) {
            System.out.println(file.toString() + "File not Found");
        } catch (IOException ex1) {
            System.out.println(file.toString() + "IO errors");
        }
    }

    /**
     * Segments the file at {@code source} into {@code destination}.
     * Does nothing when {@code source} is not an existing regular file.
     *
     * @param source      path of the file to read
     * @param destination path of the file to write
     */
    public void splitFile(String source, String destination) {
        File file = new File(source);
        if (file.isFile()) {
            splitFile(file, new File(destination));
        }
    }

    /**
     * Segments everything readable from {@code reader}, line by line, and
     * returns a new reader over the segmented text. Each line is passed to
     * {@link #splitSentence(String)} as a whole.
     *
     * @param reader the source of the text
     * @return a reader over the segmented text; if reading fails, the error is
     *         reported on standard output and the original {@code reader} is
     *         returned, possibly partly consumed
     */
    public Reader splitFile(Reader reader) {
        BufferedReader br = new BufferedReader(reader);
        StringBuffer segmented = new StringBuffer();
        try {
            String line;
            while ((line = br.readLine()) != null) {
                // splitSentence yields "" for an empty line.
                segmented.append(splitSentence(line)).append(LINE_END);
            }
            return new StringReader(segmented.toString());
        } catch (IOException ex) {
            System.out.println("IO errors while reading input: " + ex.getMessage());
            return reader;
        }
    }

    /**
     * Segments every file below {@code sourceDir}, recursing into
     * sub-directories, and writes the results under {@code destinationDir}
     * with the same relative names. Missing destination directories are
     * created.
     *
     * @param sourceDir      directory containing the files to segment
     * @param destinationDir directory that receives the segmented files
     */
    public void splitFiles(String sourceDir, String destinationDir) {
        File directory = new File(sourceDir);
        File dirdes = new File(destinationDir);
        File[] files = directory.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println(directory.toString() + " is not a readable directory");
            return;
        }
        if (!dirdes.isDirectory()) {
            dirdes.mkdirs();
        }
        for (int i = 0; i < files.length; i++) {
            File target = new File(dirdes, files[i].getName());
            if (files[i].isFile()) {
                splitFile(files[i], target);
            } else if (files[i].isDirectory()) {
                // The recursive call creates the matching destination directory.
                splitFiles(files[i].getPath(), target.getAbsolutePath());
            }
        }
    }

    /**
     * Small timing demo: segments one sentence and prints the start time, the
     * end time and the elapsed milliseconds.
     *
     * @param args optional; {@code args[0]} is the text to segment, otherwise a
     *             built-in sample sentence is used
     */
    public static void main(String[] args) {
        String sentence = args.length > 0 ? args[0] : "今天天气很好，我们去公园散步。";
        IRSplit split = IRSplit.getInstance();
        Date startdate = new Date();
        split.splitLongSentence(sentence);
        Date enddate = new Date();
        System.out.println(startdate);
        System.out.println(enddate);
        System.out.println(enddate.getTime() - startdate.getTime());
    }
}

3 Comments:

sri said...

Thanks for sharing this informative content.
Turient is an all-in-one platform for all our teaching needs. If teaching is your passion, enabling is ours.
Read the Informative blog - 11 Free Teaching Tools for Online Teachers

11 Free Teaching Tools for Online Teachers
Free Teaching Tools

tosswise said...

this is an informative post and it is very beneficial and knowledgeable. word hurdle

jane robert said...

I wanted to take a moment to express my gratitude for your helpful post. Your information is reliable and well-researched, and I found it to be a valuable resource. Thank you so much for taking the time to share your knowledge with us - your efforts are greatly appreciated
Indian divorce lawyer New York

Popular Posts