本文整理汇总了Java中edu.berkeley.nlp.lm.io.LmReaders类的典型用法代码示例。如果您正苦于以下问题：Java LmReaders类的具体用法？Java LmReaders怎么用？Java LmReaders使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
LmReaders类属于edu.berkeley.nlp.lm.io包，在下文中一共展示了LmReaders类的13个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: estimateLanguaModel
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Estimates a Kneser-Ney language model from one text file by delegating to
 * {@code LmReaders.createKneserNeyLmFromTextFiles} with default {@code ConfigOptions}.
 *
 * @param order    n-gram order handed to the estimator
 * @param textPath path of the training text file (the only input file)
 * @param arpaPath path of the output file handed to the estimator
 */
@Override
public void estimateLanguaModel(int order, String textPath, String arpaPath) {
    final List<String> trainingFiles = new ArrayList<String>();
    trainingFiles.add(textPath);

    // Use the sentence-boundary and unknown-word symbols defined by the ARPA reader.
    final StringWordIndexer indexer = new StringWordIndexer();
    indexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
    indexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
    indexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

    final File arpaOutput = new File(arpaPath);
    LmReaders.createKneserNeyLmFromTextFiles(trainingFiles, indexer, order, arpaOutput, new ConfigOptions());
}
开发者ID:nicolashernandez,项目名称:dev-star,代码行数:17,代码来源:BerkeleyLanguageModel.java
示例2: BerkeleyRawLanguageModel
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
public BerkeleyRawLanguageModel(File berkeleyLm) {
if (!berkeleyLm.isFile()) {
throw new RuntimeException("You need to specify a BerkeleyLM file: " + berkeleyLm);
}
File vocabFile = new File(berkeleyLm.getParent(), "vocab_cs.gz");
if (!vocabFile.exists()) {
throw new RuntimeException("No vocabulary file 'vocab_cs.gz' found in the BerkeleyLM directory: " + vocabFile);
}
map = LmReaders.readNgramMapFromBinary(berkeleyLm.getAbsolutePath(), vocabFile.getAbsolutePath());
/* For some reason, this crashes with IndexOutOfBoundsException:
System.out.println("---START");
Map<List<String>, LongRef> mapForOrder = map.getMapForOrder(2);
for (Map.Entry<List<String>, LongRef> entry : mapForOrder.entrySet()) {
System.out.println("E: " + entry.getKey());
}
System.out.println("---DONE");*/
}
开发者ID:languagetool-org,项目名称:languagetool,代码行数:18,代码来源:BerkeleyRawLanguageModel.java
示例3: createFromFiles
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Estimates a Kneser-Ney n-gram model from pretokenized sentences, writes it to an
 * ARPA file and loads the model back from that file.
 * Expects one sentence per line, tokens separated by space.
 *
 * @see edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText
 * @param src_dir directory containing the training text files
 * @param accept_file_regex_pattern regular expression that a file name must match
 *        (via {@link String#matches(String)}) for the file to be used
 * @param order n-gram order
 * @param arpa_filename output file (possibly null); when {@code null} the file
 *        {@code <directory name>.arpa.gz} inside {@code src_dir} is used
 * @param discount value written to every entry of {@code kneserNeyDiscounts};
 *        a negative value sets {@code kneserNeyDiscounts} to {@code null} instead
 * @param mincount value written to every entry of {@code kneserNeyMinCounts}
 * @return the result of {@code loadFromArpaFile} on the written ARPA file
 * @throws IllegalArgumentException if {@code src_dir} is not a directory
 * @throws IllegalStateException if the directory content cannot be listed
 */
public static BerkeleyLM<String> createFromFiles(final String src_dir, final String accept_file_regex_pattern, final int order, String arpa_filename, double discount, int mincount) {
    final File src_dir_ = new File(src_dir);
    if (!src_dir_.isDirectory()) {
        throw new IllegalArgumentException(String.format("Expected directory but got %s", src_dir_.getAbsolutePath()));
    }

    final File[] matched = src_dir_.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return new File(dir, name).isFile() && name.matches(accept_file_regex_pattern);
        }
    });
    // listFiles returns null (not an empty array) when the directory cannot be read.
    if (matched == null) {
        throw new IllegalStateException(String.format("Could not list files in %s", src_dir_.getAbsolutePath()));
    }

    // Pass paths that include the directory: bare file names would only resolve
    // when the working directory happens to be src_dir.
    final String[] paths = new String[matched.length];
    for (int i = 0; i < matched.length; i++) {
        paths[i] = matched[i].getPath();
    }
    final List<String> files = Arrays.asList(paths);

    final File arpa_file;
    if (arpa_filename == null) {
        arpa_file = new File(src_dir, src_dir_.getName() + ".arpa.gz");
    } else {
        arpa_file = new File(arpa_filename);
    }

    final StringWordIndexer wordIndexer = new StringWordIndexer();
    wordIndexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
    wordIndexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
    wordIndexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

    final ConfigOptions opts = new ConfigOptions();
    opts.kneserNeyMinCounts = new double[order];
    Arrays.fill(opts.kneserNeyMinCounts, mincount);
    if (discount < 0) {
        opts.kneserNeyDiscounts = null;
    } else {
        opts.kneserNeyDiscounts = new double[order];
        Arrays.fill(opts.kneserNeyDiscounts, discount);
    }
    opts.unknownWordLogProb = Properties.knUnkLog10Prob();

    // Hand over the configured options; previously a fresh ConfigOptions was passed,
    // so discount, mincount and the unknown-word probability were silently ignored.
    LmReaders.createKneserNeyLmFromTextFiles(files, wordIndexer, order, arpa_file, opts);
    return loadFromArpaFile(arpa_file.getAbsolutePath());
}
开发者ID:tudarmstadt-lt,项目名称:topicrawler,代码行数:49,代码来源:BerkeleyLM.java
示例4: LMGrammarBerkeley
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Loads a Berkeley language model from {@code lm_file}. The binary reader is tried
 * first; if it throws a {@link RuntimeException}, the file is read as ARPA instead
 * and the resulting model is wrapped in a thread-safe cache.
 *
 * @param order   n-gram order, passed to the superclass and to the ARPA reader
 * @param lm_file path of the language model file
 * @throws RuntimeException if {@code lm_file} does not exist
 */
public LMGrammarBerkeley(int order, String lm_file) {
    super(order);
    vocabIdToMyIdMapping = new int[10];

    final File lmFile = new File(lm_file);
    if (!lmFile.exists()) {
        throw new RuntimeException("Can't read lm_file '" + lm_file + "'");
    }

    if (logRequests) {
        logger.addHandler(logHandler);
        logger.setLevel(Level.FINEST);
        logger.setUseParentHandlers(false);
    }

    try {
        // Binary format first (per the original author's note, this also covers gzipped files).
        lm = (ArrayEncodedNgramLanguageModel<String>) LmReaders.<String>readLmBinary(lm_file);
        LOG.info("Loading Berkeley LM from binary {}", lm_file);
    } catch (RuntimeException notBinary) {
        // Any runtime failure of the binary reader is treated as "not a binary file".
        LOG.info("Loading Berkeley LM from ARPA file {}", lm_file);
        final ConfigOptions arpaOptions = new ConfigOptions();
        final StringWordIndexer indexer = new StringWordIndexer();
        final ArrayEncodedNgramLanguageModel<String> arpaModel =
            LmReaders.readArrayEncodedLmFromArpa(lm_file, false, indexer, arpaOptions, order);
        lm = ArrayEncodedCachingLmWrapper.wrapWithCacheThreadSafe(arpaModel);
    }

    this.unkIndex = lm.getWordIndexer().getOrAddIndex(lm.getWordIndexer().getUnkSymbol());
}
开发者ID:apache,项目名称:incubator-joshua,代码行数:29,代码来源:LMGrammarBerkeley.java
示例5: createLM
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Submits a job that builds a 4-gram Kneser-Ney language model from {@code fileName}
 * and stores the value of {@code getLMProb} for every sentence of {@code corpus}
 * in {@code lm[index]}. The job counts down {@code InvitationModel.latch} when done.
 *
 * @param fileName training text file; the model file is {@code fileName + ".lm"}
 * @param lm       output table; row {@code index} is replaced by a new array
 * @param index    row of {@code lm} to fill
 * @param corpus   sentences to score, one {@code int[]} per sentence
 */
public static void createLM(final String fileName, final float lm[][],
        final int index, final int corpus[][]) {
    jobs.execute(new Runnable() {
        @Override
        public void run() {
            log.info("Creating language model");

            final int ngramOrder = 4;
            final List<String> trainingFiles = new ArrayList<String>();
            trainingFiles.add(fileName);

            final StringWordIndexer indexer = new StringWordIndexer();
            indexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
            indexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
            indexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

            final NgramLanguageModel<String> model = LmReaders.readContextEncodedKneserNeyLmFromTextFile(
                    trainingFiles, indexer, ngramOrder, new ConfigOptions(), new File(fileName + ".lm"));

            // One score per corpus sentence, written into this job's own row.
            lm[index] = new float[corpus.length];
            for (int sentence = 0; sentence < corpus.length; sentence++) {
                lm[index][sentence] = getLMProb(model, corpus[sentence]);
            }

            log.info(".");
            InvitationModel.latch.countDown();
        }
    });
}
开发者ID:amirkamran,项目名称:InvitationModel,代码行数:38,代码来源:InvitationModel.java
示例6: readLmFromFile
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Loads the language model from {@code lmfile} and stores it, wrapped in a
 * non-thread-safe cache, in {@code lm}. The file is read as binary when its name
 * ends with {@code .b}, {@code .bi}, {@code .bin} or {@code binary} (the last suffix
 * is matched without a leading dot); otherwise it is read as an ARPA file.
 *
 * @param lmfile path of the language model file
 */
private void readLmFromFile(String lmfile) {
    System.err.println("Loading language model from " + lmfile);
    StringWordIndexer indexer = new StringWordIndexer();

    boolean binaryFormat = false;
    for (String suffix : new String[] { ".b", ".bi", ".bin", "binary" }) {
        if (lmfile.endsWith(suffix)) {
            binaryFormat = true;
            break;
        }
    }

    NgramLanguageModel<String> loaded;
    if (binaryFormat) {
        loaded = LmReaders.readLmBinary(lmfile);
    } else {
        loaded = LmReaders.readArrayEncodedLmFromArpa(lmfile, false, indexer);
    }

    ArrayEncodedNgramLanguageModel<String> arrayEncoded = (ArrayEncodedNgramLanguageModel<String>) loaded;
    lm = ArrayEncodedCachingLmWrapper.wrapWithCacheNotThreadSafe(arrayEncoded);
}
开发者ID:siddBanPsu,项目名称:WikiKreator,代码行数:21,代码来源:MyBerkeleyLm.java
示例7: readLmFromFile
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Reads the language model stored in {@code lmfile} and assigns it to {@code lm}
 * behind a non-thread-safe caching wrapper. Names ending in {@code .b}, {@code .bi},
 * {@code .bin} or {@code binary} are read with the binary reader; every other name
 * is read with the ARPA reader.
 *
 * @param lmfile path of the language model file
 */
private void readLmFromFile(String lmfile) {
    System.err.println("Loading language model from " + lmfile);
    StringWordIndexer wordIndexer = new StringWordIndexer();

    final boolean shortBinarySuffix = lmfile.endsWith(".b") || lmfile.endsWith(".bi");
    final boolean longBinarySuffix = lmfile.endsWith(".bin") || lmfile.endsWith("binary");

    NgramLanguageModel<String> model;
    if (!(shortBinarySuffix || longBinarySuffix)) {
        model = LmReaders.readArrayEncodedLmFromArpa(lmfile, false, wordIndexer);
    } else {
        model = LmReaders.readLmBinary(lmfile);
    }

    lm = ArrayEncodedCachingLmWrapper.wrapWithCacheNotThreadSafe(
            (ArrayEncodedNgramLanguageModel<String>) model);
}
开发者ID:siddBanPsu,项目名称:WikiKreator,代码行数:15,代码来源:LMReader.java
示例8: loadLanguageModel
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Reads an array-encoded language model from an ARPA file and hands it to
 * {@code setLM}. Ranked prob/backoff storage is switched off, the unknown-word log
 * probability is set to -10, and {@code Integer.MAX_VALUE} is passed as the order
 * argument of the reader.
 *
 * @param arpaPath path of the ARPA file
 */
@Override
public void loadLanguageModel(String arpaPath) {
    final String arpaLocation = new File(arpaPath).getPath();

    final ConfigOptions readerOptions = new ConfigOptions();
    readerOptions.storeRankedProbBackoffs = false;
    readerOptions.unknownWordLogProb = -10.0f;

    setLM(LmReaders.readArrayEncodedLmFromArpa(
            arpaLocation, false, new StringWordIndexer(), readerOptions, Integer.MAX_VALUE));
}
开发者ID:nicolashernandez,项目名称:dev-star,代码行数:15,代码来源:BerkeleyLanguageModel.java
示例9: BerkeleyLanguageModel
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
public BerkeleyLanguageModel(File berkeleyLm) {
if (!berkeleyLm.isFile()) {
throw new RuntimeException("You need to specify a BerkeleyLM file (*.blm.gz): " + berkeleyLm);
}
File vocabFile = new File(berkeleyLm.getParent(), "vocab_cs.gz");
if (!vocabFile.exists()) {
throw new RuntimeException("No vocabulary file 'vocab_cs.gz' found in the BerkeleyLM directory: " + vocabFile);
}
lm = LmReaders.readGoogleLmBinary(berkeleyLm.getAbsolutePath(), vocabFile.getAbsolutePath());
}
开发者ID:languagetool-org,项目名称:languagetool,代码行数:11,代码来源:BerkeleyLanguageModel.java
示例10: readFromBinary
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Reads a binary language model file and wraps it in a {@code BerkeleyLM}.
 *
 * @param file the binary language model file
 * @param <W>  word type of the model
 * @return a new {@code BerkeleyLM} around the loaded model
 */
public static <W> BerkeleyLM<W> readFromBinary(File file) {
    final String binaryPath = file.getAbsolutePath();
    // The reader's result is cast to the array-encoded model type that BerkeleyLM wraps.
    @SuppressWarnings("unchecked")
    ArrayEncodedNgramLanguageModel<W> loadedModel = (ArrayEncodedNgramLanguageModel<W>) LmReaders.readLmBinary(binaryPath);
    return new BerkeleyLM<W>(loadedModel);
}
开发者ID:tudarmstadt-lt,项目名称:topicrawler,代码行数:6,代码来源:LanguageModelHelper.java
示例11: saveAsBinary
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Writes the model held by {@code blm} to {@code file} in binary format.
 *
 * @param blm  wrapper whose {@code get()} result is written
 * @param file destination file
 * @param <W>  word type of the model
 */
public static <W> void saveAsBinary(BerkeleyLM<W> blm, File file) {
    LmReaders.writeLmBinary(
            blm.get(),
            file.getAbsolutePath());
}
开发者ID:tudarmstadt-lt,项目名称:topicrawler,代码行数:4,代码来源:LanguageModelHelper.java
示例12: createFileFromText
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Playground routine: estimates a trigram Kneser-Ney model from
 * {@code src/test/resources/test.txt}, writes it to {@code _svnignore/test.arpa.gz},
 * reads that ARPA file back as an array-encoded model and prints the log
 * probability of a few sample word sequences to standard output.
 */
public static void createFileFromText() {
    String textPath = "src/test/resources/test.txt";
    String arpaPath = "_svnignore/test.arpa.gz";
    String binaryPath = "_svnignore/test.blm.gz"; // currently unused: the binary export step is disabled

    final StringWordIndexer indexer = new StringWordIndexer();
    indexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
    indexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
    indexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

    ConfigOptions options = new ConfigOptions();
    options.kneserNeyDiscounts = new double[] { 0.75f, 0.6f, 0.6f };
    options.kneserNeyMinCounts = new double[] { 0, 0, 0, 0, 0, 0, 0 };

    // Pass 1: collect counts from the text. Pass 2: write the estimated model as ARPA.
    final TextReader<String> textReader = new TextReader<String>(Arrays.asList(textPath), indexer);
    KneserNeyLmReaderCallback<String> estimator = new KneserNeyLmReaderCallback<String>(indexer, 3, options);
    textReader.parse(estimator);
    estimator.parse(new KneserNeyFileWritingLmReaderCallback<String>(new File(arpaPath), indexer));

    NgramLanguageModel<String> lm = LmReaders.readArrayEncodedLmFromArpa(arpaPath, false);

    final String[][] samples = {
        { "Hallo" },
        { "schöne", "neue", "welt" },
        { "schöne", "neue", "pups" },
        { "schöne", "neue", "globus" },
        { "schöne", "neue", "erde" },
    };
    for (String[] sample : samples) {
        System.out.println(lm.getLogProb(Arrays.asList(sample)));
    }
}
开发者ID:tudarmstadt-lt,项目名称:topicrawler,代码行数:48,代码来源:BerkeleyLmPlayground.java
示例13: loadLanguageModel
import edu.berkeley.nlp.lm.io.LmReaders; //导入依赖的package包/类
/**
 * Reads a binary language model file and stores it in {@code blm} after casting
 * it to {@code ArrayEncodedNgramLanguageModel}.
 *
 * @param binaryPath path of the binary language model file
 */
@Override
public void loadLanguageModel(String binaryPath) {
    final Object loadedModel = LmReaders.readLmBinary(binaryPath);
    this.blm = (ArrayEncodedNgramLanguageModel) loadedModel;
}
开发者ID:nicolashernandez,项目名称:dev-star,代码行数:8,代码来源:BinaryBerkeleyLanguageModel.java
注：本文中的edu.berkeley.nlp.lm.io.LmReaders类示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论