本文整理汇总了Java中org.jwat.warc.WarcReader类的典型用法代码示例。如果您正苦于以下问题：Java WarcReader类的具体用法？Java WarcReader怎么用？Java WarcReader使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
WarcReader类属于org.jwat.warc包,在下文中一共展示了WarcReader类的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: getNextResponseRecord
import org.jwat.warc.WarcReader; //导入依赖的package包/类
/**
 * Advances the reader to the next WARC record whose {@code WARC-Type} is
 * {@code response} and returns its payload bytes, its start/end offsets in
 * the stream and its target URI.
 *
 * <p>Records of any other type are skipped, as are records that lack a
 * {@code WARC-Type} or target-URI header. A failure while fetching a record
 * is retried, but only a bounded number of times in a row, so that a
 * permanently broken stream cannot make this method loop forever.
 *
 * @param warcReader reader to pull records from
 * @return the next response record, or {@code null} once the reader reports
 *         that no further records are available
 * @throws IOException if the payload cannot be read, or if fetching the next
 *         record keeps failing without any record being produced in between
 */
private RecordWithOffsetsAndURL getNextResponseRecord(WarcReader warcReader)
        throws IOException {
    // Bound on back-to-back getNextRecord() failures before giving up.
    final int maxConsecutiveFailures = 100;
    int consecutiveFailures = 0;
    while (true) {
        WarcRecord wr;
        try {
            wr = warcReader.getNextRecord();
        } catch (IOException e) {
            // Best effort: a single unreadable record should not abort the
            // whole scan, but an endlessly failing reader must surface.
            consecutiveFailures++;
            if (consecutiveFailures >= maxConsecutiveFailures) {
                throw e;
            }
            continue;
        }
        consecutiveFailures = 0;
        if (wr == null) {
            return null;
        }
        long offset = warcReader.getStartOffset();
        // Constant-first comparison also covers a missing WARC-Type header.
        String type = headerValueOrNull(wr, "WARC-Type");
        if (!"response".equals(type)) {
            continue;
        }
        String url = headerValueOrNull(wr, WARC_TARGET_URI);
        if (url == null) {
            // A response without a target URI cannot be attributed; skip it.
            continue;
        }
        byte[] rawContent = IOUtils.toByteArray(wr.getPayloadContent());
        // Read the offset only after the payload has been consumed.
        long endOffset = warcReader.getOffset();
        return new RecordWithOffsetsAndURL(rawContent, offset, endOffset, url);
    }
}

/** Returns the value of the named header, or {@code null} when the record has no such header. */
private String headerValueOrNull(WarcRecord record, String name) {
    return record.getHeader(name) == null ? null : record.getHeader(name).value;
}
开发者ID:JulianEberius,项目名称:dwtc-extractor,代码行数:24,代码来源:Worker.java
示例2: setReaderOptions
import org.jwat.warc.WarcReader; //导入依赖的package包/类
/**
 * Applies this module's digest and URI-validation settings to a WARC reader.
 *
 * @param reader WARC reader instance to configure
 * @throws JHOVE2Exception if the reader reports the configured block or
 *         payload digest algorithm as not accepted
 */
protected void setReaderOptions(WarcReader reader) throws JHOVE2Exception {
    // Digest computation switches.
    reader.setBlockDigestEnabled(bComputeBlockDigest);
    reader.setPayloadDigestEnabled(bComputePayloadDigest);

    // Digest algorithms: the setters report whether the name was accepted.
    boolean blockAlgorithmAccepted = reader.setBlockDigestAlgorithm(blockDigestAlgorithm);
    if (!blockAlgorithmAccepted) {
        throw new JHOVE2Exception("Invalid block digest algorithm: " + blockDigestAlgorithm);
    }
    boolean payloadAlgorithmAccepted = reader.setPayloadDigestAlgorithm(payloadDigestAlgorithm);
    if (!payloadAlgorithmAccepted) {
        throw new JHOVE2Exception("Invalid payload digest algorithm: " + payloadDigestAlgorithm);
    }

    // Digest encodings.
    reader.setBlockDigestEncoding(blockDigestEncoding);
    reader.setPayloadDigestEncoding(payloadDigestEncoding);

    // URI profiles: strict RFC 3986 when requested, the lax profile otherwise.
    reader.setWarcTargetUriProfile(
            bStrictTargetUriValidation ? UriProfile.RFC3986 : UriProfile.RFC3986_ABS_16BIT_LAX);
    reader.setUriProfile(
            bStrictUriValidation ? UriProfile.RFC3986 : UriProfile.RFC3986_ABS_16BIT_LAX);
}
开发者ID:opf-labs,项目名称:jhove2,代码行数:27,代码来源:WarcModule.java
示例3: parseRecordsUncompressed
import org.jwat.warc.WarcReader; //导入依赖的package包/类
/**
 * Parse WARC records that are not encased in GZip entries. Parsing should
 * be straightforward, with all records accessible through the same source.
 * The parent source is marked as an aggregate and every record the reader
 * yields is handed to {@code processRecord}.
 * @param jhove2 the JHove2 characterization context
 * @param sourceFactory JHove2 source factory
 * @param parentSource WARC source unit
 * @param reader WARC reader used to parse records
 * @throws EOFException if EOF occurs prematurely
 * @throws IOException if an IO error occurs while processing
 * @throws JHOVE2Exception if the reader is {@code null} or a serious problem
 *         needs to be reported
 */
protected void parseRecordsUncompressed(JHOVE2 jhove2, SourceFactory sourceFactory,
        Source parentSource, WarcReader reader)
        throws EOFException, IOException, JHOVE2Exception {
    // Without a reader there is nothing to iterate over.
    if (reader == null) {
        throw new JHOVE2Exception("WarcReader is null");
    }
    parentSource.setIsAggregate(true);
    // Pull records until the reader signals the end with null.
    for (WarcRecord next = reader.getNextRecord(); next != null; next = reader.getNextRecord()) {
        processRecord(jhove2, sourceFactory, parentSource, next);
    }
}
开发者ID:opf-labs,项目名称:jhove2,代码行数:30,代码来源:WarcModule.java
示例4: parseRecordsCompressed
import org.jwat.warc.WarcReader; //导入依赖的package包/类
/**
 * Parse WARC record(s) where the source has been identified as a source of
 * a GZip module instance. Since each record will presumably be parsed from
 * a different source, the reader's stream-based method is used instead of
 * its own input. The parent source is marked as an aggregate and every
 * record obtained is handed to {@code processRecord}.
 * @param jhove2 the JHove2 characterization context
 * @param sourceFactory JHove2 source factory
 * @param parentSource WARC source unit
 * @param reader WARC reader used to parse records
 * @param offset record offset relative to input stream
 * @throws EOFException if EOF occurs prematurely
 * @throws IOException if an IO error occurs while processing
 * @throws JHOVE2Exception if the reader is {@code null} or a serious problem
 *         needs to be reported
 */
protected void parseRecordsCompressed(JHOVE2 jhove2, SourceFactory sourceFactory,
        Source parentSource, WarcReader reader, Long offset)
        throws EOFException, IOException, JHOVE2Exception {
    // Without a reader there is nothing to iterate over.
    if (reader == null) {
        throw new JHOVE2Exception("WarcReader is null");
    }
    parentSource.setIsAggregate(true);
    InputStream in = parentSource.getInputStream();
    final int bufferSize = 8192;
    // Keep asking for records from the parent's stream until none is returned.
    for (WarcRecord next = reader.getNextRecordFrom(in, offset, bufferSize);
            next != null;
            next = reader.getNextRecordFrom(in, offset, bufferSize)) {
        processRecord(jhove2, sourceFactory, parentSource, next);
    }
}
开发者ID:opf-labs,项目名称:jhove2,代码行数:32,代码来源:WarcModule.java
示例5: fullText
import org.jwat.warc.WarcReader; //导入依赖的package包/类
/**
 * Fetches a byte range of an object from the {@code aws-publicdatasets}
 * bucket, reads the first WARC record found in it and returns that record's
 * payload as text.
 *
 * <p>The S3 data stream and the WARC reader are released even when reading
 * fails part-way through.
 *
 * @param s3Link key of the object inside the bucket
 * @param startOffset first byte of the range to fetch
 * @param endOffset last byte of the range to fetch
 * @return the payload of the first record in the range, decoded as text
 * @throws IOException if the range cannot be read or contains no WARC record
 * @throws ServiceException if the S3 request fails
 */
public String fullText(String s3Link, long startOffset, long endOffset)
        throws IOException, ServiceException {
    S3Object inputObject = s3.getObject("aws-publicdatasets", s3Link, null,
            null, null, null, startOffset, endOffset);
    WarcReader warcReader = null;
    try {
        warcReader = WarcReaderFactory
                .getReaderCompressed(inputObject.getDataInputStream());
        WarcRecord wr = warcReader.getNextRecord();
        if (wr == null) {
            // An empty range used to surface as a bare NullPointerException.
            throw new IOException("No WARC record found in " + s3Link
                    + " (bytes " + startOffset + "-" + endOffset + ")");
        }
        // NOTE(review): decodes with the platform default charset; confirm
        // whether an explicit charset should be used for these payloads.
        return IOUtils.toString(wr.getPayloadContent());
    } finally {
        // Same close order as before, but now also on the failure path.
        try {
            inputObject.closeDataInputStream();
        } finally {
            if (warcReader != null) {
                warcReader.close();
            }
        }
    }
}
开发者ID:JulianEberius,项目名称:dwtc-tools,代码行数:13,代码来源:CCFulltext.java
示例6: test
import org.jwat.warc.WarcReader; //导入依赖的package包/类
/**
 * Converts the bundled {@code testcrawl-3.49-2.zip} HTTrack crawl to WARC,
 * then walks the resulting {@code crawl-0.warc.gz} and checks the HTTP
 * version of request/response records, the warcinfo payload, and the
 * overall sequence of record types and target URIs.
 */
@Test
public void test() throws IOException {
    final String expectedWarcinfoPayload =
            "software: HTTrack/3.49-2 http://www.httrack.com/\r\n" +
            "software: httrack2warc https://github.com/nla/httrack2warc\r\n" +
            "httrackOptions: -%H http://test.example.org/\r\n";
    final String expectedSummary =
            "warcinfo null\n" +
            "response http://test.example.org/\n" +
            "request http://test.example.org/\n" +
            "metadata http://test.example.org/\n" +
            "response http://test.example.org/style.css\n" +
            "request http://test.example.org/style.css\n" +
            "metadata http://test.example.org/style.css\n" +
            "response http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
            "request http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
            "metadata http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
            "response http://test.example.org/another\n" +
            "request http://test.example.org/another\n" +
            "metadata http://test.example.org/another\n" +
            "response http://test.example.org/image.gif\n" +
            "request http://test.example.org/image.gif\n" +
            "metadata http://test.example.org/image.gif\n";

    // Unpack the sample crawl and convert it into a fresh output directory.
    Path crawlPath = temp.newFolder().toPath();
    Path outdir = temp.newFolder().toPath();
    TestUtils.unzip(HttrackRecordTest.class.getResourceAsStream("testcrawl-3.49-2.zip"), crawlPath);

    Httrack2Warc httrack2Warc = new Httrack2Warc();
    httrack2Warc.setOutputDirectory(outdir);
    httrack2Warc.convert(crawlPath);

    // Read the generated WARC back, recording one "type url" line per record.
    Path warcFile = outdir.resolve("crawl-0.warc.gz");
    StringBuilder summary = new StringBuilder();
    try (WarcReader warcReader = WarcReaderFactory.getReaderCompressed(Files.newInputStream(warcFile))) {
        for (WarcRecord warcRecord : warcReader) {
            String type = getHeader(warcRecord, "WARC-Type");
            String url = getHeader(warcRecord, "WARC-Target-URI");
            summary.append(type).append(" ").append(url).append("\n");
            switch (type) {
                case "request":
                case "response":
                    assertEquals("HTTP/1.1", warcRecord.getHttpHeader().httpVersion);
                    break;
                case "warcinfo":
                    assertEquals(expectedWarcinfoPayload, slurp(warcRecord.getPayloadContent()));
                    break;
                default:
                    // Other record types (e.g. metadata) only appear in the summary.
                    break;
            }
        }
    }
    assertEquals(expectedSummary, summary.toString());
}
开发者ID:nla,项目名称:httrack2warc,代码行数:47,代码来源:Httrack2WarcTest.java
注：本文中的org.jwat.warc.WarcReader类示例整理自GitHub/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论