资源简介

处理中文地址的分词和匹配：采用混合分词算法进行中文地址分词；在中文地址分词的基础上，采用 Double Levenshtein 算法计算中文地址相似度，从而进行地址匹配。

资源截图

代码片段和文件信息

package experiment;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.HbaseConfiguration;

import com.AddressSegment.data.dao.impl.AddressQueryImpl;
import com.AddressSegment.logic.AddressSplitImpl;
import com.AddressSegment.logic.UndefinedWordRecognize;
import com.AddressSegment.metadata.model.CharDictionary;
import com.AddressSegment.metadata.model.WordDictionary;
import com.AddressSegment.tool.dao.impl.DictionaryFileOperationDAOImpl;
import com.AddressSegment.util.Config;

public class CountAddress {
public static Configuration config = null;
public static FileSystem fs = null;
public static DictionaryFileOperationDAOImpl DF = null;
public static WordDictionary wordDict = null;
public static CharDictionary charDict = null;
public static int rowkey = 0;

// public static HTablePool pool = new HTablePool(config 1000);

static {
config = HbaseConfiguration.create();
wordDict = new WordDictionary();
charDict = new CharDictionary();
Configuration conf = new Configuration();
try {
fs = FileSystem.get(URI.create(“hdfs://192.168.31.172:9000“) conf);
} catch (IOException e1) {
e1.printStackTrace();
}
try {
DF = new DictionaryFileOperationDAOImpl(Config.getDefaultDictionaryHDFSURL()
Config.getCharDictionaryHDFSURL() fs);
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
DF.putFileToDict(wordDict charDict);
}

public static void ComputeAddressCount(String fileInputPath String fileOutputPath) throws IOException URISyntaxException{
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileInputPath)“GBK“));
String line = ““;
FileOutputStream out=new FileOutputStream(fileOutputPath);
PrintStream p=new PrintStream(out);
while ((line = br.readLine()) != null) {
Integer count = 0;

AddressSplitImpl asi = new AddressSplitImpl();
ArrayList strArray = asi.Split(line fs);
UndefinedWordRecognize uwr = new UndefinedWordRecognize();
ArrayList wordArray1 = uwr.getUndefinedWord(strArray);
AddressQueryImpl aqi = new AddressQueryImpl();
count = aqi.queryAddressCount(wordArray1);

System.out.println(line);
System.out.println(count);
p.println(line+“\t“+count);
}
p.close();
br.close();
}

public static void main(String[] args) throws IOException URISyntaxException {
ComputeAddressCount(“C:/Users/HYFrank/Desktop/Noname1.txt“ “C:/Users/HYFrank/Desktop/countAddress.txt“);
}

}

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件       5930  2016-08-09 22:51  src\com\AddressSegment\data\dao\baseJdbcTemplate.java

     文件        411  2016-08-01 11:36  src\com\AddressSegment\data\dao\declare\AddressQuery.java

     文件        238  2016-01-31 15:22  src\com\AddressSegment\data\dao\declare\SegmentInsert.java

     文件       5207  2016-08-09 22:24  src\com\AddressSegment\data\dao\impl\AddressQueryImpl.java

     文件       1480  2016-02-25 14:56  src\com\AddressSegment\data\dao\impl\SegmentInsertImpl.java

     文件        212  2016-01-29 23:28  src\com\AddressSegment\data\dao\ModelRowMapper.java

     文件        814  2016-02-25 23:13  src\com\AddressSegment\logic\AddressEncodingService.java

     文件       1020  2016-04-05 00:21  src\com\AddressSegment\logic\AddressSplitImpl.java

     文件       3924  2016-08-30 00:37  src\com\AddressSegment\logic\AlgorithmDaoImpl.java

     文件       3620  2016-02-26 00:36  src\com\AddressSegment\logic\GaodeEncodingServiceInvoker.java

     文件        283  2016-02-25 17:00  src\com\AddressSegment\logic\service\AddressEncoding.java

     文件        621  2016-04-05 00:20  src\com\AddressSegment\logic\service\AddressSplit.java

     文件       1201  2016-04-12 21:53  src\com\AddressSegment\logic\service\AddressTageMaking.java

     文件        219  2016-01-25 21:49  src\com\AddressSegment\logic\service\AlgorithmInterface.java

     文件       3418  2016-03-05 14:01  src\com\AddressSegment\logic\service\HttpRequestTemplate.java

     文件        258  2016-07-24 23:35  src\com\AddressSegment\logic\service\IHttpResponseHandler.java

     文件        271  2016-07-24 23:35  src\com\AddressSegment\logic\service\UndefinedWordRecognizeInterface.java

     文件       4771  2016-07-22 09:49  src\com\AddressSegment\logic\UndefinedWordRecognize.java

     文件       5195  2016-04-12 23:04  src\com\AddressSegment\main\AddressRegexTage.java

     文件       4395  2016-04-09 01:07  src\com\AddressSegment\main\AddressSegment.java

     文件       5349  2016-07-07 21:17  src\com\AddressSegment\main\AddressSegmentTage.java

     文件       6496  2016-07-20 14:06  src\com\AddressSegment\main\AddressSegmentToHbase.java

     文件       5976  2016-07-24 23:26  src\com\AddressSegment\main\AddressSegmentToHDFS.java

     文件       6047  2016-07-27 00:45  src\com\AddressSegment\main\AddressSegmentToHDFSToHbase.java

     文件       2661  2016-04-07 23:20  src\com\AddressSegment\main\WordCount.java

     文件        719  2016-01-24 00:56  src\com\AddressSegment\metadata\model\Algorithm.java

     文件       1985  2016-01-23 16:59  src\com\AddressSegment\metadata\model\baseModel.java

     文件        836  2016-01-23 16:59  src\com\AddressSegment\metadata\model\CharDictionary.java

     文件        407  2016-02-25 16:59  src\com\AddressSegment\metadata\model\CoordinateCode.java

     文件       1179  2016-01-30 22:38  src\com\AddressSegment\metadata\model\Segment.java

............此处省略226个文件信息

评论

共有 条评论

相关资源