本文整理汇总了Java中org.deeplearning4j.text.sentenceiterator.BasicLineIterator类的典型用法代码示例。如果您正苦于以下问题:Java BasicLineIterator类的具体用法?Java BasicLineIterator怎么用?Java BasicLineIterator使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
BasicLineIterator类属于org.deeplearning4j.text.sentenceiterator包,在下文中一共展示了BasicLineIterator类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: testFindNamesFromText
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");
log.info("load is right!");
TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
//tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());
//Generates a word-vector from the dataset stored in resources folder
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
.learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
vec.fit();
WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));
//trains a model that can find out all names from news(Suffix txt),It uses word vector generated
// WordVectors wordVectors;
//test model,Whether the model find out name from unknow text;
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:ChineseTokenizerTest.java
示例2: testWord2VecPlot
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testWord2VecPlot() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
.layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
.tokenizerFactory(t).build();
vec.fit();
// UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
// vec.getLookupTable().plotVocab(100, connectionInfo);
Thread.sleep(10000000000L);
fail("Not implemented");
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:23,代码来源:ManualTests.java
示例3: testWord2VecMultiEpoch
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testWord2VecMultiEpoch() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).epochs(3)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:27,代码来源:Word2VecTests.java
示例4: testWord2VecGoogleModelUptraining
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
long time1 = System.currentTimeMillis();
Word2Vec vec = WordVectorSerializer.readWord2VecModel(
new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
long time2 = System.currentTimeMillis();
log.info("Model loaded in {} msec", time2 - time1);
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
vec.setTokenizerFactory(t);
vec.setSentenceIterator(iter);
vec.getConfiguration().setUseHierarchicSoftmax(false);
vec.getConfiguration().setNegative(5.0);
vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
vec.fit();
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:Word2VecTests.java
示例5: testVocab
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testVocab() throws Exception {
File inputFile = new ClassPathResource("big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile);
Set<String> set = new HashSet<>();
int lines = 0;
int cnt = 0;
while (iter.hasNext()) {
Tokenizer tok = t.create(iter.nextSentence());
for (String token : tok.getTokens()) {
if (token == null || token.isEmpty() || token.trim().isEmpty())
continue;
cnt++;
if (!set.contains(token))
set.add(token);
}
lines++;
}
log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
log.info("Set:\n" + set);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:26,代码来源:VocabConstructorTest.java
示例6: hasNext
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void hasNext() throws Exception {
SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
.tokenizerFactory(factory).build();
Iterator<Sequence<VocabWord>> iter = transformer.iterator();
int cnt = 0;
Sequence<VocabWord> sequence = null;
while (iter.hasNext()) {
sequence = iter.next();
assertNotEquals("Failed on [" + cnt + "] iteration", null, sequence);
assertNotEquals("Failed on [" + cnt + "] iteration", 0, sequence.size());
cnt++;
}
// log.info("Last element: {}", sequence.asLabels());
assertEquals(97162, cnt);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:ParallelTransformerIteratorTest.java
示例7: nextDocument
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void nextDocument() throws Exception {
SentenceIterator sentence = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(sentence).build();
int cnt = 0;
while (backed.hasNextDocument()) {
backed.nextDocument();
cnt++;
}
assertEquals(97162, cnt);
backed.reset();
AsyncLabelAwareIterator iterator = new AsyncLabelAwareIterator(backed, 64);
cnt = 0;
while (iterator.hasNext()) {
iterator.next();
cnt++;
if (cnt == 10)
iterator.reset();
}
assertEquals(97172, cnt);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:26,代码来源:AsyncLabelAwareIteratorTest.java
示例8: testHasNextDocument1
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testHasNextDocument1() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();
int cnt = 0;
while (iterator.hasNextDocument()) {
iterator.nextDocument();
cnt++;
}
assertEquals(97162, cnt);
LabelsSource generator = iterator.getLabelsSource();
assertEquals(97162, generator.getLabels().size());
assertEquals("DOCZ_0", generator.getLabels().get(0));
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:BasicLabelAwareIteratorTest.java
示例9: trainParagraghVecModel
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
/*
if you don't have LabelAwareIterator handy, you can use synchronized labels generator
it will be used to label each document/sequence/line with it's own label.
But if you have LabelAwareIterator ready, you can can provide it, for your in-house labels
*/
LabelsSource source = new LabelsSource("DOC_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(100)
.epochs(1)
.layerSize(50)
.learningRate(0.02)
.labelsSource(source)
.windowSize(5)
.iterate(iter)
.trainWordVectors(true)
.vocabCache(cache)
.tokenizerFactory(t)
.sampling(0)
.build();
vec.fit();
WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
开发者ID:gizemsogancioglu,项目名称:biosses,代码行数:34,代码来源:SentenceVectorsBasedSimilarity.java
示例10: testWord2VecAdaGrad
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testWord2VecAdaGrad() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(5).learningRate(0.025).layerSize(100)
.seed(42).batchSize(13500).sampling(0).negativeSample(0)
//.epochs(10)
.windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false)
.useHierarchicSoftmax(true).iterate(iter).workers(4).tokenizerFactory(t).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:28,代码来源:Word2VecTests.java
示例11: testWord2VecCBOW
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testWord2VecCBOW() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertTrue(sim > 0.65f);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:28,代码来源:Word2VecTests.java
示例12: ASCIICoOccurrenceReader
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
public ASCIICoOccurrenceReader(@NonNull File file, @NonNull VocabCache<T> vocabCache) {
this.vocabCache = vocabCache;
this.file = file;
try {
iterator = new PrefetchingSentenceIterator.Builder(new BasicLineIterator(file)).build();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:10,代码来源:ASCIICoOccurrenceReader.java
示例13: testHasNextDocument2
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testHasNextDocument2() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();
int cnt = 0;
while (iterator.hasNextDocument()) {
iterator.nextDocument();
cnt++;
}
assertEquals(97162, cnt);
iterator.reset();
cnt = 0;
while (iterator.hasNextDocument()) {
iterator.nextDocument();
cnt++;
}
assertEquals(97162, cnt);
LabelsSource generator = iterator.getLabelsSource();
// this is important moment. Iterator after reset should not increase number of labels attained
assertEquals(97162, generator.getLabels().size());
assertEquals("DOCZ_0", generator.getLabels().get(0));
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:33,代码来源:BasicLabelAwareIteratorTest.java
示例14: main
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
public static void main(String[] args) throws Exception {
ClassPathResource srcFile = new ClassPathResource("/raw_sentences.txt");
File file = srcFile.getFile();
SentenceIterator iter = new BasicLineIterator(file);
TokenizerFactory tFact = new DefaultTokenizerFactory();
tFact.setTokenPreProcessor(new CommonPreprocessor());
LabelsSource labelFormat = new LabelsSource("LINE_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(5)
.epochs(1)
.layerSize(100)
.learningRate(0.025)
.labelsSource(labelFormat)
.windowSize(5)
.iterate(iter)
.trainWordVectors(false)
.tokenizerFactory(tFact)
.sampling(0)
.build();
vec.fit();
double similar1 = vec.similarity("LINE_9835", "LINE_12492");
out.println("Comparing lines 9836 & 12493 ('This is my house .'/'This is my world .') Similarity = " + similar1);
double similar2 = vec.similarity("LINE_3720", "LINE_16392");
out.println("Comparing lines 3721 & 16393 ('This is my way .'/'This is my work .') Similarity = " + similar2);
double similar3 = vec.similarity("LINE_6347", "LINE_3720");
out.println("Comparing lines 6348 & 3721 ('This is my case .'/'This is my way .') Similarity = " + similar3);
double dissimilar1 = vec.similarity("LINE_3720", "LINE_9852");
out.println("Comparing lines 3721 & 9853 ('This is my way .'/'We now have one .') Similarity = " + dissimilar1);
double dissimilar2 = vec.similarity("LINE_3720", "LINE_3719");
out.println("Comparing lines 3721 & 3720 ('This is my way .'/'At first he says no .') Similarity = " + dissimilar2);
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:46,代码来源:ClassifyBySimilarity.java
示例15: testWord2VecCBOWBig
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
//iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
//SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");
TokenizerFactory t = new KoreanTokenizerFactory();
//t = new DefaultTokenizerFactory();
//t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.allowParallelTokenization(true).tokenizerFactory(t)
.elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
long time1 = System.currentTimeMillis();
vec.fit();
long time2 = System.currentTimeMillis();
log.info("Total execution time: {}", (time2 - time1));
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:26,代码来源:PerformanceTests.java
示例16: testRunWord2Vec
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testRunWord2Vec() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
.stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
.sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
//.negativeSample(10)
.epochs(1).windowSize(5).allowParallelTokenization(true)
.modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
assertEquals(new ArrayList<String>(), vec.getStopWords());
vec.fit();
File tempFile = File.createTempFile("temp", "temp");
tempFile.deleteOnExit();
WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath());
Collection<String> lst = vec.wordsNearest("day", 10);
//log.info(Arrays.toString(lst.toArray()));
printWords("day", lst, vec);
assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(sim < 1.0);
assertTrue(sim > 0.4);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertFalse(lst.contains(null));
lst = vec.wordsNearest("day", 10);
//log.info(Arrays.toString(lst.toArray()));
printWords("day", lst, vec);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
new File("cache.ser").delete();
ArrayList<String> labels = new ArrayList<>();
labels.add("day");
labels.add("night");
labels.add("week");
INDArray matrix = vec.getWordVectors(labels);
assertEquals(matrix.getRow(0), vec.getWordVectorMatrix("day"));
assertEquals(matrix.getRow(1), vec.getWordVectorMatrix("night"));
assertEquals(matrix.getRow(2), vec.getWordVectorMatrix("week"));
WordVectorSerializer.writeWordVectors(vec, pathToWriteto);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:65,代码来源:Word2VecTests.java
示例17: testW2VnegativeOnRestore
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testW2VnegativeOnRestore() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
.stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
.sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1)
.windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true)
.modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());
log.info("Fit 1");
vec.fit();
File tmpFile = File.createTempFile("temp", "file");
tmpFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tmpFile);
iter.reset();
Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
restoredVec.setTokenizerFactory(t);
restoredVec.setSentenceIterator(iter);
assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());
log.info("Fit 2");
restoredVec.fit();
iter.reset();
restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
restoredVec.setTokenizerFactory(t);
restoredVec.setSentenceIterator(iter);
assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);
log.info("Fit 3");
restoredVec.fit();
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:52,代码来源:Word2VecTests.java
示例18: testUnknown1
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testUnknown1() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
.unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100)
.stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
.sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
.useHierarchicSoftmax(true).allowParallelTokenization(true)
.modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
vec.fit();
assertTrue(vec.hasWord("PEWPEW"));
assertTrue(vec.getVocab().containsWord("PEWPEW"));
INDArray unk = vec.getWordVectorMatrix("PEWPEW");
assertNotEquals(null, unk);
File tempFile = File.createTempFile("temp", "file");
tempFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tempFile);
log.info("Original configuration: {}", vec.getConfiguration());
Word2Vec restored = WordVectorSerializer.readWord2VecModel(tempFile);
assertTrue(restored.hasWord("PEWPEW"));
assertTrue(restored.getVocab().containsWord("PEWPEW"));
INDArray unk_restored = restored.getWordVectorMatrix("PEWPEW");
assertEquals(unk, unk_restored);
// now we're getting some junk word
INDArray random = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
INDArray randomRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
log.info("Restored configuration: {}", restored.getConfiguration());
assertEquals(unk, random);
assertEquals(unk, randomRestored);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:50,代码来源:Word2VecTests.java
示例19: testOutputStream
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
@Test
public void testOutputStream() throws Exception {
File file = File.createTempFile("tmp_ser", "ssa");
file.deleteOnExit();
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
InMemoryLookupCache cache = new InMemoryLookupCache(false);
WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
.cache(cache).lr(0.025f).build();
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
.lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
.vocabCache(cache).seed(42)
// .workers(6)
.windowSize(5).iterate(iter).tokenizerFactory(t).build();
assertEquals(new ArrayList<String>(), vec.getStopWords());
vec.fit();
INDArray day1 = vec.getWordVectorMatrix("day");
WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));
WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
INDArray day2 = vec2.getWordVectorMatrix("day");
assertEquals(day1, day2);
File tempFile = File.createTempFile("tetsts", "Fdfs");
tempFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tempFile);
Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:42,代码来源:WordVectorSerializerTest.java
示例20: testParagraphVectorsVocabBuilding1
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; //导入依赖的package包/类
/**
* This test checks, how vocab is built using SentenceIterator provided, without labels.
*
* @throws Exception
*/
@Test
public void testParagraphVectorsVocabBuilding1() throws Exception {
ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
File file = resource.getFile();//.getParentFile();
SentenceIterator iter = new BasicLineIterator(file); //UimaSentenceIterator.createWithPath(file.getAbsolutePath());
int numberOfLines = 0;
while (iter.hasNext()) {
iter.nextSentence();
numberOfLines++;
}
iter.reset();
InMemoryLookupCache cache = new InMemoryLookupCache(false);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
// LabelsSource source = new LabelsSource("DOC_");
ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).layerSize(100)
// .labelsGenerator(source)
.windowSize(5).iterate(iter).vocabCache(cache).tokenizerFactory(t).build();
vec.buildVocab();
LabelsSource source = vec.getLabelsSource();
//VocabCache cache = vec.getVocab();
log.info("Number of lines in corpus: " + numberOfLines);
assertEquals(numberOfLines, source.getLabels().size());
assertEquals(97162, source.getLabels().size());
assertNotEquals(null, cache);
assertEquals(97406, cache.numWords());
// proper number of words for minWordsFrequency = 1 is 244
assertEquals(244, cache.numWords() - source.getLabels().size());
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:47,代码来源:ParagraphVectorsTest.java
注:本文中的org.deeplearning4j.text.sentenceiterator.BasicLineIterator类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论