本文整理汇总了Java中org.archive.io.ArchiveReader类的典型用法代码示例。如果您正苦于以下问题：Java ArchiveReader类的具体用法？Java ArchiveReader怎么用？Java ArchiveReader使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
ArchiveReader类属于org.archive.io包,在下文中一共展示了ArchiveReader类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: processWarc
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Iterates over every record of one WARC file and hands each record to
 * {@code handleRecord}.
 *
 * <p>{@code setStrict(false)} is applied to the reader and to every record.
 * The running statistics object is printed to stderr after the first record
 * and then once every 1000 records.
 *
 * @param warcFile path of the WARC file to process
 * @throws IOException if the file cannot be opened or read
 */
private void processWarc(Path warcFile) throws IOException {
    extractorStats.addWarc(warcFile.getFileName().toString());
    // try-with-resources: previously neither the stream nor the reader was
    // closed, so a file handle leaked for every processed WARC (also on error).
    try (InputStream is = Files.newInputStream(warcFile);
            ArchiveReader reader = WARCReaderFactory.get(warcFile.toString(), is, true)) {
        reader.setStrict(false);
        int i = 0;
        for (ArchiveRecord record : reader) {
            record.setStrict(false);
            extractorStats.visitedRecord();
            handleRecord(record);
            if (i++ % 1000 == 0) {
                System.err.println(extractorStats);
            }
        }
    }
}
开发者ID:tballison,项目名称:SimpleCommonCrawlExtractor,代码行数:17,代码来源:AbstractExtractor.java
示例2: generate
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Writes pages read from the WARC at {@code sourceURL} to {@code path}, one
 * JSON document per line.
 *
 * <p>Pages that are empty or have no outbound links are skipped. Writing stops
 * as soon as {@code numPages} pages have been written, or when the archive is
 * exhausted, whichever comes first.
 *
 * @param path output file
 * @param numPages number of pages after which writing stops
 * @throws Exception if the archive cannot be read or the file cannot be written
 */
public static void generate(Path path, int numPages) throws Exception {
    Gson gson = new Gson();
    long count = 0;
    // The reader is now closed together with the writer; it used to be leaked.
    try (BufferedWriter writer = Files.newBufferedWriter(path);
            ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0)) {
        for (ArchiveRecord r : ar) {
            Page p = ArchiveUtil.buildPage(r);
            if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
                log.debug("Skipping {}", p.getUrl());
                continue;
            }
            log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
            String json = gson.toJson(p);
            writer.write(json);
            writer.newLine();
            count++;
            if (count == numPages) {
                break;
            } else if ((count % 1000) == 0) {
                log.info("Wrote {} of {} pages to {}", count, numPages, path);
            }
        }
    }
    // Report what was actually written: the archive may hold fewer usable
    // pages than requested, in which case numPages would be misleading.
    log.info("Wrote {} pages to {}", count, path);
}
开发者ID:astralway,项目名称:webindex,代码行数:27,代码来源:SampleData.java
示例3: test
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Parses the bundled {@code example.warc.gz} resource into CDX records and
 * checks the record count plus every field of the first capture.
 */
@Test
public void test() throws IOException {
    final String warcName = "example.warc.gz";
    final URL warcUrl = getClass().getResource(warcName);
    final List<Cdx.CdxRecord> parsed;
    try (ArchiveReader reader = ArchiveReaderFactory.get(warcUrl)) {
        final int warcLength = warcUrl.openConnection().getContentLength();
        parsed = Cdx.records(reader, warcName, warcLength).collect(Collectors.toList());
    }

    assertEquals(2, parsed.size());

    final Cdx.Capture first = (Cdx.Capture) parsed.get(0);
    assertEquals("text/html", first.contentType);
    assertEquals(200, first.status);
    assertEquals("20161116220655", first.date);
    assertEquals("http://www-test.nla.gov.au/xinq/presentations/abstract.html", first.url);
    assertEquals(2756, first.compressedLength);
    assertEquals(339, first.offset);
    assertEquals("387f5ef1511fe47bf91ca9fdcf6c41511fc3e480", first.digest);
}
开发者ID:nla,项目名称:bamboo,代码行数:20,代码来源:CdxTest.java
示例4: test2
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Parses the bundled {@code notfound.warc.gz} resource into CDX records and
 * checks the record count plus every field of the first capture, including
 * its redirect location.
 */
@Test
public void test2() throws IOException {
    final String warcName = "notfound.warc.gz";
    final URL warcUrl = getClass().getResource(warcName);
    final List<Cdx.CdxRecord> parsed;
    try (ArchiveReader reader = ArchiveReaderFactory.get(warcUrl)) {
        final int warcLength = warcUrl.openConnection().getContentLength();
        parsed = Cdx.records(reader, warcName, warcLength).collect(Collectors.toList());
    }

    assertEquals(3, parsed.size());

    final Cdx.Capture first = (Cdx.Capture) parsed.get(0);
    assertEquals("text/html", first.contentType);
    assertEquals(302, first.status);
    assertEquals("20161128015313", first.date);
    assertEquals("http://nla.gov.au/foobar", first.url);
    assertEquals(665, first.compressedLength);
    assertEquals(830, first.offset);
    assertEquals("WNT4SKWUNA5F4Q3HYKF5AMY2M5ZIPBYW", first.digest);
    assertEquals("http://www.nla.gov.au/foobar", first.location);
}
开发者ID:nla,项目名称:bamboo,代码行数:21,代码来源:CdxTest.java
示例5: findFirstRecordWithUri
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Scans an ARC file for the first record, after the leading one, whose header
 * URL equals {@code uri}.
 *
 * @param f the ARC file to scan
 * @param uri the URL to look for
 * @return the first matching record, or {@code null} if none matches
 * @throws IOException if the ARC file cannot be opened
 */
private ArchiveRecord findFirstRecordWithUri(File f, String uri)
        throws IOException {
    final Iterator<ArchiveRecord> records = ARCReaderFactory.get(f).iterator();
    // Skip the ARC file header. Per the original author's note,
    // ARCReaderFactory guarantees that this first record exists and is a
    // filedesc, or it would have thrown.
    records.next();
    // NOTE(review): the reader is deliberately left open here because the
    // matching record is handed back to the caller - confirm who closes it.
    while (records.hasNext()) {
        final ArchiveRecord candidate = records.next();
        final String candidateUrl = candidate.getHeader().getUrl();
        if (candidateUrl.equals(uri)) {
            return candidate;
        }
    }
    return null;
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:20,代码来源:ARCUtilsTester.java
示例6: get
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Reads the record located at {@code index} in the named file under
 * {@code arcDir} and wraps it in a {@link BitarchiveRecord}.
 *
 * @param arcfile name of the archive file, relative to {@code arcDir}
 * @param index offset of the record within the file
 * @return the record found at that offset
 * @throws ArgumentNotValid if {@code arcfile} is null or {@code index} is negative
 * @throws IOFailure if reading the record fails with an I/O error
 */
@Override
public BitarchiveRecord get(String arcfile, long index)
        throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(arcfile, "arcfile");
    ArgumentNotValid.checkNotNegative(index, "index");
    final File archiveFile = new File(arcDir, arcfile);
    try {
        // NOTE(review): the reader is never closed. Confirm whether
        // BitarchiveRecord consumes the record eagerly before adding a close.
        final ArchiveRecord recordAtOffset = ArchiveReaderFactory.get(archiveFile).get(index);
        return new BitarchiveRecord(recordAtOffset, arcfile);
    } catch (IOException e) {
        throw new IOFailure("Error reading record from " + arcfile + " offset " + index, e);
    }
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:17,代码来源:TestArcRepositoryClient.java
示例7: map
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Emits the URL of every JSON record in the archive as an output key, paired
 * with a {@code NullWritable} value.
 *
 * <p>Records whose mimetype is not {@code application/json}, or whose URL is
 * blank, are skipped. A failure while writing one key is logged and counted
 * under {@code MAPPERCOUNTER.EXCEPTIONS}; processing then continues.
 *
 * @param key input key (not used)
 * @param value the archive whose records are scanned
 * @param context Hadoop context that receives the output and the counters
 * @throws IOException if iterating the archive fails
 */
@Override
public void map(Text key, ArchiveReader value, Context context)
        throws IOException {
    for (ArchiveRecord r : value) {
        // Constant-first comparison: a record whose header has no mimetype is
        // skipped as "not JSON" instead of aborting the whole map call with a
        // NullPointerException.
        if (!"application/json".equals(r.getHeader().getMimetype())) {
            continue;
        }
        String sourceURL = r.getHeader().getUrl();
        if (StringUtils.isBlank(sourceURL)) {
            continue;
        }
        outKey.set(sourceURL);
        try {
            context.write(outKey, NullWritable.get());
        } catch (Exception ex) {
            LOG.error("Caught Exception", ex);
            context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
        } finally {
            IOUtils.closeQuietly(r);
        }
    }
}
开发者ID:DigitalPebble,项目名称:NutchFight,代码行数:23,代码来源:URLExtractor.java
示例8: initialize
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Opens the file behind the given split and prepares the record iterator,
 * the reusable key and the reusable record value.
 *
 * <p>An {@link IOException} raised while opening is logged and not rethrown,
 * as before; in that case the iterator and the reusable objects are left
 * unset.
 *
 * @param is the input split; must be a {@code FileSplit}
 * @param tac task context supplying the Hadoop configuration
 */
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) is;
    Path path = fileSplit.getPath();
    FSDataInputStream fileInputStream = null;
    try {
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());
        fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        ArchiveReader reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();
        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        // Release the stream that was opened before the failure; it used to
        // stay open for the lifetime of the task.
        if (fileInputStream != null) {
            try {
                fileInputStream.close();
            } catch (IOException closeEx) {
                ex.addSuppressed(closeEx);
            }
        }
        // Name the offending file instead of logging a null message.
        Logger.getLogger(ArcRecordReader.class.getName())
                .log(Level.SEVERE, "Unable to open archive " + path, ex);
    }
}
开发者ID:shsdev,项目名称:archiventory,代码行数:26,代码来源:ArcRecordReader.java
示例9: getArchiveReader
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Creates a reader for an ARC file, choosing the compressed or uncompressed
 * implementation from the result of {@code testCompressedARCFile}.
 *
 * @param arcFile the ARC file to open
 * @param skipSuffixTest passed through to {@code testCompressedARCFile}
 * @param offset offset handed to the reader constructor
 * @return a reader positioned according to {@code offset}
 * @throws IOException if the file is not compressed and fails the
 *         extension-and-magic check, or if opening it fails
 */
protected ArchiveReader getArchiveReader(final File arcFile,
        final boolean skipSuffixTest, final long offset)
        throws IOException {
    if (testCompressedARCFile(arcFile, skipSuffixTest)) {
        return (ARCReader) ARCReaderFactory.factory.new CompressedARCReader(arcFile, offset);
    }

    // Only uncompressed files are checked for the ARC extension and magic number.
    final boolean looksLikeArc = FileUtils.isReadableWithExtensionAndMagic(
            arcFile, ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER);
    if (!looksLikeArc) {
        throw new IOException(arcFile.getAbsolutePath() +
                " is not an Internet Archive ARC file.");
    }
    return (ARCReader) ARCReaderFactory.factory.new UncompressedARCReader(arcFile, offset);
}
开发者ID:iipc,项目名称:webarchive-commons,代码行数:18,代码来源:ARCReaderFactory.java
示例10: offsetResourceTest
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Opens {@code testfile} at {@code offset} through a stream built on a
 * {@link RandomAccessFile} descriptor, reads one record and checks its URL
 * and that the read position does not exceed the record length.
 *
 * @param testfile ARC file to read from
 * @param offset byte offset at which the expected record starts
 * @param uri URL the record at that offset is expected to have
 * @throws Exception if the file cannot be read
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    // try-with-resources: the file used to stay open whenever an assertion or
    // the reader threw before the trailing close() was reached.
    try (RandomAccessFile raf = new RandomAccessFile(testfile, "r")) {
        raf.seek(offset);
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        // Original author's note - this alternative works:
        // ARCReaderFactory.get(testfile, offset)
        ArchiveRecord record = reader.get();
        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);
        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    }
}
开发者ID:iipc,项目名称:webarchive-commons,代码行数:22,代码来源:ARCReaderFactoryTest.java
示例11: main
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Fetches a compressed Common Crawl WARC file over HTTPS and prints, for at
 * most the first six records, the header, the URL and up to the first 500
 * characters of the content.
 *
 * @param args unused
 * @throws IOException if the WARC file cannot be fetched or read
 */
public static void main(String[] args) throws IOException {
    String url = "https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-23/segments/1404776400583.60/warc/CC-MAIN-20140707234000-00000-ip-10-180-212-248.ec2.internal.warc.gz";
    // The file name identifies the ArchiveReader and indicates if it should be decompressed
    String fn = url;
    // The source is a URL, so it must be opened as a URL stream; the previous
    // FileInputStream on this string could only fail with FileNotFoundException.
    // To read a local copy instead, open a FileInputStream on its path and
    // pass that path as the name.
    // Both resources are closed when the block exits.
    try (java.io.InputStream is = new java.net.URL(url).openStream();
            ArchiveReader ar = WARCReaderFactory.get(fn, is, true)) {
        // Once we have an ArchiveReader, we can work through each of the records it contains
        int i = 0;
        for (ArchiveRecord r : ar) {
            // The header contains information such as the type of record, size, creation time, and URL
            System.out.println(r.getHeader());
            System.out.println(r.getHeader().getUrl());
            System.out.println();
            // The ArchiveRecord is itself an InputStream; read as many bytes as it reports available
            byte[] rawData = IOUtils.toByteArray(r, r.available());
            // Convert to a string and print the start of it, hoping it is text
            String content = new String(rawData);
            System.out.println(content.substring(0, Math.min(500, content.length())));
            System.out.println((content.length() > 500 ? "..." : ""));
            // Separator to make the output more readable
            System.out.println("=-=-=-=-=-=-=-=-=");
            if (i++ > 4) {
                break;
            }
        }
    }
}
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:36,代码来源:WARCReaderTest.java
示例12: map
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Emits one {@code (lower-cased tag name, outVal)} pair for every match of
 * {@code HTML_TAG_PATTERN} in the body of each HTTP response whose header
 * text contains {@code Content-Type: text/html}.
 *
 * <p>Any exception raised while handling a record is logged and counted under
 * {@code MAPPERCOUNTER.EXCEPTIONS}; the remaining records are still processed.
 *
 * @param key input key (not used)
 * @param value the archive whose records are scanned
 * @param context Hadoop context that receives the output and the counters
 * @throws IOException if iterating the archive fails
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
    // Compiled once per call and reused for every record of this archive
    patternTag = Pattern.compile(HTML_TAG_PATTERN);
    for (ArchiveRecord record : value) {
        try {
            LOG.debug(record.getHeader().getUrl() + " -- " + record.available());

            // Requests and metadata records are of no interest, only responses
            final boolean isResponse =
                    record.getHeader().getMimetype().equals("application/http; msgtype=response");
            if (!isResponse) {
                continue;
            }

            // Read the full message and split it at the blank line that ends the HTTP header
            final byte[] payload = IOUtils.toByteArray(record, record.available());
            final String message = new String(payload);
            final int headerEnd = message.indexOf("\r\n\r\n");
            final String httpHeader = message.substring(0, headerEnd);

            // Deliberately lax check for HTML content
            // TODO: Proper HTTP header parsing + don't trust headers
            if (!httpHeader.contains("Content-Type: text/html")) {
                continue;
            }
            context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);

            // The body is only extracted once the record is known to be HTML
            final String html = message.substring(headerEnd + 4);
            matcherTag = patternTag.matcher(html);
            while (matcherTag.find()) {
                outKey.set(matcherTag.group(1).toLowerCase());
                context.write(outKey, outVal);
            }
        } catch (Exception ex) {
            LOG.error("Caught Exception", ex);
            context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
        }
    }
}
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:40,代码来源:TagCounterMap.java
示例13: map
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Emits one {@code (token, outVal)} pair for every whitespace-separated token
 * found in each {@code text/plain} record of the archive.
 *
 * <p>Counters track plain-text records seen, records of other types, records
 * that produced no tokens, and exceptions. An exception on one record is
 * logged and does not stop the remaining records from being processed.
 *
 * @param key input key (not used)
 * @param value the archive whose records are scanned
 * @param context Hadoop context that receives the output and the counters
 * @throws IOException if iterating the archive fails
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
    for (ArchiveRecord record : value) {
        try {
            final boolean isPlainText = record.getHeader().getMimetype().equals("text/plain");
            if (!isPlainText) {
                context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
                continue;
            }

            context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
            LOG.debug(record.getHeader().getUrl() + " -- " + record.available());

            // Read the full message into memory and split it into words
            final byte[] payload = IOUtils.toByteArray(record, record.available());
            tokenizer = new StringTokenizer(new String(payload));
            if (!tokenizer.hasMoreTokens()) {
                context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
                continue;
            }

            // At least one token is known to exist at this point
            do {
                outKey.set(tokenizer.nextToken());
                context.write(outKey, outVal);
            } while (tokenizer.hasMoreTokens());
        } catch (Exception ex) {
            LOG.error("Caught Exception", ex);
            context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
        }
    }
}
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:31,代码来源:WordCounterMap.java
示例14: main
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Builds the index RDDs from the WARC files in a data directory and prints
 * the splits calculated from them to stdout.
 *
 * @param args exactly one argument: the data directory
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.error("Usage: CalcSplits <dataDir>");
        System.exit(1);
    }

    final String inputDir = args[0];
    IndexEnv.validateDataDir(inputDir);

    final SparkConf conf = new SparkConf().setAppName("webindex-calcsplits");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        final IndexStats indexStats = new IndexStats(sc);

        final JavaPairRDD<Text, ArchiveReader> warcs = sc.newAPIHadoopFile(
                inputDir, WARCFileInputFormat.class, Text.class, ArchiveReader.class,
                new Configuration());

        final JavaRDD<Page> pageRdd = IndexUtil.createPages(warcs);
        final JavaPairRDD<String, UriInfo> uris = IndexUtil.createUriMap(pageRdd);
        final JavaPairRDD<String, Long> domains = IndexUtil.createDomainMap(uris);
        final JavaPairRDD<RowColumn, Bytes> index =
                IndexUtil.createAccumuloIndex(indexStats, pageRdd, uris, domains);

        final SortedSet<Text> tableSplits = IndexUtil.calculateSplits(index, 100);
        log.info("Accumulo splits:");
        for (Text split : tableSplits) {
            System.out.println(split);
        }
    }
}
开发者ID:astralway,项目名称:webindex,代码行数:29,代码来源:CalcSplits.java
示例15: main
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Parses every record of each WARC file in the requested range of a paths
 * file, one Spark partition per file, discarding the resulting pages.
 *
 * <p>A failure on one file is logged and does not stop the other files.
 *
 * @param args two arguments: the paths file and the range within it
 * @throws Exception if configuration loading or Spark setup fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.error("Usage: TestParser <pathsFile> <range>");
        System.exit(1);
    }
    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
    if (loadList.isEmpty()) {
        log.error("No files to load given {} {}", args[0], args[1]);
        System.exit(1);
    }
    WebIndexConfig.load();
    SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
        log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
                args[0]);
        JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());
        final String prefix = WebIndexConfig.CC_URL_PREFIX;
        loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
            String urlToCopy = prefix + path;
            log.info("Parsing {}", urlToCopy);
            // Close the reader after each file; it used to be left open, which
            // leaked one remote connection per parsed WARC.
            try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
                for (ArchiveRecord record : reader) {
                    ArchiveUtil.buildPageIgnoreErrors(record);
                }
            } catch (Exception e) {
                log.error("Exception while processing {}", path, e);
            }
        }));
    }
}
开发者ID:astralway,项目名称:webindex,代码行数:39,代码来源:TestParser.java
示例16: main
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Sets the Fluo table splits and, when a data directory is given, initializes
 * the indexes from the WARC files found there.
 *
 * @param args optionally one argument: the data directory
 * @throws Exception if configuration loading or initialization fails
 */
public static void main(String[] args) throws Exception {
    if (args.length > 1) {
        log.error("Usage: Init [<dataDir>]");
        System.exit(1);
    }

    final WebIndexConfig config = WebIndexConfig.load();
    final IndexEnv indexEnv = new IndexEnv(config);
    indexEnv.setFluoTableSplits();
    log.info("Initialized Fluo table splits");

    // Without a data directory there is nothing more to do
    if (args.length != 1) {
        log.info("An init data dir was not specified");
        return;
    }

    final String inputDir = args[0];
    IndexEnv.validateDataDir(inputDir);

    final SparkConf conf = new SparkConf().setAppName("webindex-init");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        final IndexStats indexStats = new IndexStats(sc);
        final JavaPairRDD<Text, ArchiveReader> warcs = sc.newAPIHadoopFile(
                inputDir, WARCFileInputFormat.class, Text.class, ArchiveReader.class,
                new Configuration());
        final JavaRDD<Page> pageRdd = IndexUtil.createPages(warcs);
        indexEnv.initializeIndexes(sc, pageRdd, indexStats);
        indexStats.print();
    }
}
开发者ID:astralway,项目名称:webindex,代码行数:35,代码来源:Init.java
示例17: createPages
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Converts an RDD of archives into an RDD of pages.
 *
 * <p>Every record of every archive is mapped through
 * {@code ArchiveUtil.buildPageIgnoreErrors}. The result is repartitioned into
 * 50 partitions per archive and persisted with {@code DISK_ONLY_2}.
 *
 * @param archives pairs whose value is the archive to read
 * @return the persisted RDD of pages
 */
public static JavaRDD<Page> createPages(JavaPairRDD<Text, ArchiveReader> archives) {
    final int archiveCount = (int) archives.count();
    final int targetPartitions = 50 * archiveCount;

    final JavaRDD<ArchiveRecord> allRecords = archives.flatMap(Tuple2::_2);
    final JavaRDD<Page> pages = allRecords.map(ArchiveUtil::buildPageIgnoreErrors);

    return pages
            .repartition(targetPartitions)
            .persist(StorageLevel.DISK_ONLY_2());
}
开发者ID:astralway,项目名称:webindex,代码行数:10,代码来源:IndexUtil.java
示例18: readPages
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Reads a WARC file and returns its pages keyed by URL.
 *
 * <p>Pages that are empty or have no outbound links are left out.
 *
 * @param input the WARC file to read
 * @return map from page URL to page
 * @throws Exception if the file cannot be read or a page cannot be built
 */
public static Map<URL, Page> readPages(File input) throws Exception {
    Map<URL, Page> pageMap = new HashMap<>();
    // try-with-resources: the trailing close() was skipped whenever building
    // a page threw, leaving the file open.
    try (ArchiveReader ar = WARCReaderFactory.get(input)) {
        for (ArchiveRecord r : ar) {
            Page p = ArchiveUtil.buildPage(r);
            if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
                continue;
            }
            pageMap.put(URL.fromUri(p.getUri()), p);
        }
    }
    return pageMap;
}
开发者ID:astralway,项目名称:webindex,代码行数:14,代码来源:IndexIT.java
示例19: CdxRecordProducer
import org.archive.io.ArchiveReader; //导入依赖的package包/类
CdxRecordProducer(ArchiveReader warc, String filename, long warcLength) {
iterator = Iterators.peekingIterator(warc.iterator());
this.filename = filename;
this.warcLength = warcLength;
if (filename.startsWith("nla.arc-")) {
pandoraHacks = true;
}
}
开发者ID:nla,项目名称:bamboo,代码行数:10,代码来源:Cdx.java
示例20: main
import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Prints the Solr document built from every record of each archive named on
 * the command line; records for which {@code makeDoc} returns {@code null}
 * are skipped.
 *
 * @param args names of the archives to read
 * @throws UncheckedIOException if an archive cannot be opened or read
 */
public static void main(String args[]) {
    for (String archiveName : args) {
        try (ArchiveReader archive = ArchiveReaderFactory.get(archiveName)) {
            for (ArchiveRecord entry : archive) {
                final SolrInputDocument solrDoc = makeDoc(entry);
                if (solrDoc == null) {
                    continue;
                }
                System.out.println(solrDoc.toString());
            }
        } catch (IOException ioFailure) {
            throw new UncheckedIOException(ioFailure);
        }
    }
}
开发者ID:nla,项目名称:bamboo,代码行数:15,代码来源:SolrIndexer.java
注：本文中的org.archive.io.ArchiveReader类示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论