本文整理汇总了Java中org.archive.io.warc.WARCRecord类的典型用法代码示例。如果您正苦于以下问题:Java WARCRecord类的具体用法?Java WARCRecord怎么用?Java WARCRecord使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
WARCRecord类属于org.archive.io.warc包,在下文中一共展示了WARCRecord类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: readHeaderLine
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Reads one CRLF-terminated line from the given record.
 * <p>
 * Bytes are consumed one at a time until a {@code \r\n} pair or the end of
 * the stream is reached. The terminating {@code \r\n} is not part of the
 * returned string. If an {@link IOException} occurs while reading, whatever
 * has been read so far is returned.
 *
 * @param warc the record to read from
 * @return the line read, without its trailing CRLF (possibly empty)
 */
private String readHeaderLine(WARCRecord warc) {
    StringBuilder sb = new StringBuilder();
    try {
        // Keep the value returned by read() as an int: casting it to char
        // before the end-of-stream test turns -1 into '\uffff', which never
        // equals -1 and made the original loop spin forever at end of stream.
        int previous = '\n';
        int current;
        while ((current = warc.read()) != -1) {
            if (current == '\n' && previous == '\r') {
                // trim the CR (\r) appended in the previous iteration
                sb.deleteCharAt(sb.length() - 1);
                break;
            }
            sb.append((char) current);
            previous = current;
        }
    } catch (IOException e) {
        // Best effort, as before: fall through and return what was read.
    }
    return sb.toString();
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:21,代码来源:Page.java
示例2: wrapArchiveHeader
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Wraps the header of a Heritrix archive record.
 * <p>
 * Every header field of the record is copied into the wrapper under its
 * lower-cased name, and the wrapper is flagged as ARC or WARC according to
 * the runtime type of the record.
 *
 * @param recordWrapper the record wrapper the new header wrapper is linked to;
 *        it is stored as-is and is not null-checked here
 * @param record original Heritrix record; validated with
 *        {@code ArgumentNotValid.checkNotNull}
 * @return the wrapped Heritrix record header
 * @throws ArgumentNotValid if the record is neither an ARCRecord nor a WARCRecord
 */
public static HeritrixArchiveHeaderWrapper wrapArchiveHeader(HeritrixArchiveRecordWrapper recordWrapper, ArchiveRecord record) {
    ArgumentNotValid.checkNotNull(record, "record");

    HeritrixArchiveHeaderWrapper wrapper = new HeritrixArchiveHeaderWrapper();
    wrapper.recordWrapper = recordWrapper;
    wrapper.header = record.getHeader();

    // Copy the native header fields, normalising the keys to lower case.
    Map<String, Object> nativeFields = (Map<String, Object>) wrapper.header.getHeaderFields();
    for (Map.Entry<String, Object> field : nativeFields.entrySet()) {
        wrapper.headerFields.put(field.getKey().toLowerCase(), field.getValue());
    }

    if (record instanceof ARCRecord) {
        wrapper.bIsArc = true;
        return wrapper;
    }
    if (record instanceof WARCRecord) {
        wrapper.bIsWarc = true;
        return wrapper;
    }
    throw new ArgumentNotValid(
            "Unsupported ArchiveRecord type: "
            + record.getClass().getName());
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:31,代码来源:HeritrixArchiveHeaderWrapper.java
示例3: getMimetypeBatchFilter
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Creates a batch filter that accepts only the WARC records whose header
 * mimetype starts with the given mimetype string.
 * <p>
 * Note that the mimetype of a WARC response record is not (necessarily) the
 * same as the mimetype of its payload.
 *
 * @param mimetype String denoting the mimetype this filter represents
 * @return a BatchFilter that rejects all WARCRecords that do not have this
 *         mimetype
 * @throws MimeTypeParseException If mimetype is invalid
 */
public static WARCBatchFilter getMimetypeBatchFilter(final String mimetype)
        throws MimeTypeParseException {
    ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype");
    if (!mimetypeIsOk(mimetype)) {
        throw new MimeTypeParseException("Mimetype argument '" + mimetype
                + "' is invalid");
    }
    final String filterName = MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype;
    return new WARCBatchFilter(filterName) {
        public boolean accept(WARCRecord record) {
            // The mimetype is read through the Heritrix wrapper's header.
            HeritrixArchiveRecordWrapper wrapped = new HeritrixArchiveRecordWrapper(record);
            String recordMimetype = wrapped.getHeader().getMimetype();
            return recordMimetype.startsWith(mimetype);
        }
    };
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:23,代码来源:WARCBatchFilter.java
示例4: testARCReaderClose
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Checks that a WARC record and its reader can be closed without an
 * IOException after a BitarchiveRecord has been built from the record.
 */
public void testARCReaderClose() {
    try {
        final File original = new File(ARCHIVE_DIR + "fyensdk.warc");
        final File workingCopy = new File(ARCHIVE_DIR + testFileName);
        FileUtils.copyFile(original, workingCopy);

        WARCReader warcReader = WARCReaderFactory.get(workingCopy);
        WARCRecord firstRecord = (WARCRecord) warcReader.get(0);
        // Only the construction matters here; the instance is never used.
        new BitarchiveRecord(firstRecord, testFileName);

        firstRecord.close();
        warcReader.close();
        workingCopy.delete();
    } catch (IOException e) {
        fail("Should not throw IOException " + e);
    }
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:19,代码来源:WARCReaderTester.java
示例5: next
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Returns the next WARC record, or {@code null} when the underlying
 * iterator has no further records.
 */
@Override
public WARCRecord next() {
    return warcRecordsIterator.hasNext() ? warcRecordsIterator.next() : null;
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:8,代码来源:WarcTargetRepository.java
示例6: shouldStoreAndIterageOverData
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Inserts one relevant page into a WARC repository and verifies that
 * iterating the repository yields exactly one record carrying the page URL,
 * content type and ACHE relevance headers.
 */
@Test
public void shouldStoreAndIterageOverData() throws IOException {
    // given
    String folderPath = tempFolder.newFolder().toString();
    Page target = new Page(new URL(url), html, responseHeaders);
    target.setTargetRelevance(TargetRelevance.RELEVANT);
    target.setFetchTime(System.currentTimeMillis());
    WarcTargetRepository repository = new WarcTargetRepository(folderPath);

    // when
    repository.insert(target);
    repository.close();

    File dataDirectory = new File(folderPath);
    if (dataDirectory.isDirectory()) {
        File firstFile = dataDirectory.listFiles()[0];
        assertTrue(firstFile.getName().startsWith("crawl_data"));
    }
    Iterator<WARCRecord> records = repository.iterator();

    // then
    assertThat(records.hasNext(), is(true));
    WARCRecord stored = records.next();
    assertThat(records.hasNext(), is(false));

    assertThat(stored.getHeader().getUrl(), is(url));
    assertThat(stored.getHeader().getHeaderValue("Content-Type"),
            is(WARCConstants.HTTP_RESPONSE_MIMETYPE));
    assertThat(stored.getHeader().getHeaderValue("ACHE-IsRelevant"),
            is(target.getTargetRelevance().isRelevant() + ""));

    Double storedRelevance =
            Double.valueOf(stored.getHeader().getHeaderValue("ACHE-Relevance").toString());
    Double expectedRelevance = Double.valueOf(target.getTargetRelevance().getRelevance());
    assertThat(storedRelevance, is(expectedRelevance));
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:40,代码来源:WarcTargetRepositoryTest.java
示例7: testReadingMultipleWarcRecordsUsingIterator
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Inserts two pages and verifies that the repository iterator returns them
 * in insertion order and is exhausted afterwards.
 */
@Test
public void testReadingMultipleWarcRecordsUsingIterator() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();
    String firstUrl = "http://a.com";
    String secondUrl = "http://b.com";
    Page firstPage = new Page(new URL(firstUrl), html);
    Page secondPage = new Page(new URL(secondUrl), html);
    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(firstPage);
    repository.insert(secondPage);
    repository.close();
    RepositoryIterator records = repository.iterator();

    // then
    assertTrue(records.hasNext());
    WARCRecord current = records.next();
    assertThat(current.getHeader().getUrl(), is(firstUrl));

    assertTrue(records.hasNext());
    current = records.next();
    assertThat(current.getHeader().getUrl(), is(secondUrl));

    assertFalse(records.hasNext());
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:31,代码来源:WarcTargetRepositoryTest.java
示例8: testShouldNotFailWhenThereAreNonASCIICharactersOnHeaders
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Stores a page whose Content-Disposition header contains the character
 * 0x80 and verifies that the record can still be read back with its body
 * and that header value intact.
 */
@Test
public void testShouldNotFailWhenThereAreNonASCIICharactersOnHeaders() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();
    String pageUrl = "http://a.com";
    char nonAsciiChar = (char) 0x80;
    String headerValue = "inline; filename=\"Invalid_" + nonAsciiChar + "\"";
    Map<String, List<String>> headers = new HashMap<>();
    headers.put("Content-Disposition", asList(headerValue));
    Page page = new Page(new URL(pageUrl), html, headers);
    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(page);
    repository.close();
    RepositoryIterator records = repository.iterator();

    // then
    assertTrue(records.hasNext());
    WARCRecord stored = records.next();
    assertThat(stored.getHeader().getUrl(), is(pageUrl));

    String content = IOUtils.toString(stored);
    assertThat(content, containsString(html));
    assertThat(content, containsString(headerValue));

    assertFalse(records.hasNext());
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:33,代码来源:WarcTargetRepositoryTest.java
示例9: shouldIterateOverEmptyFolder
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Verifies that iterating a repository created on an empty folder reports
 * no elements and that next() then returns null.
 */
@Test
public void shouldIterateOverEmptyFolder() throws IOException {
    // given
    String emptyFolder = tempFolder.newFolder().toString();
    WarcTargetRepository repository = new WarcTargetRepository(emptyFolder);

    // when
    Iterator<WARCRecord> records = repository.iterator();

    // then
    assertThat(records.hasNext(), is(false));
    assertThat(records.next(), is(nullValue()));
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:15,代码来源:WarcTargetRepositoryTest.java
示例10: index
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Create and return the index of the ArcHarvestFile.
 * <p>
 * The archive file named by {@code getName()} is opened below
 * {@code baseDir} and iterated record by record. WARC records are indexed
 * (via {@code indexWARCResponse}) only when they are response records whose
 * mimetype is not {@code text/dns}; every non-WARC record is handed to
 * {@code indexARCRecord}. As a side effect the {@code compressed} field is
 * set from the reader.
 *
 * @param baseDir the base directory of the arcs
 * @return the map filled in by the record indexing helpers
 * @throws IOException thrown if there is an error
 * @throws ParseException presumably propagated from the record indexing
 *         helpers (not visible here)
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
    Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
    File theArchiveFile = new File(baseDir, this.getName());
    ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
    try {
        this.compressed = reader.isCompressed();
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord rec = it.next();
            if (!(rec instanceof WARCRecord)) {
                indexARCRecord(rec, results);
                continue;
            }
            // A record without a type header cannot be a response record;
            // skip it instead of failing with a NullPointerException.
            Object type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
            if (type == null || !type.toString().equals(WARCConstants.RESPONSE)) {
                continue;
            }
            String mime = rec.getHeader().getMimetype();
            if (!mime.equals("text/dns")) {
                indexWARCResponse(rec, results);
            }
        }
    } finally {
        // Release the underlying file even when iteration or indexing fails.
        reader.close();
    }
    return results;
}
开发者ID:DIA-NZ,项目名称:webcurator,代码行数:35,代码来源:ArcHarvestFileDTO.java
示例11: HeritrixArchiveRecordWrapper
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
* Construct a Heritrix record wrapper object.
* @param record Heritrix record object
*/
public HeritrixArchiveRecordWrapper(ArchiveRecord record) {
ArgumentNotValid.checkNotNull(record, "record");
this.record = record;
this.header = HeritrixArchiveHeaderWrapper.wrapArchiveHeader(this, record);
if (record instanceof ARCRecord) {
this.bIsArc = true;
} else if (record instanceof WARCRecord) {
this.bIsWarc = true;
} else {
throw new ArgumentNotValid(
"Unsupported ArchiveRecord type: "
+ record.getClass().getName());
}
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:19,代码来源:HeritrixArchiveRecordWrapper.java
示例12: getSize
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Returns the size of the record content.
 * <p>
 * For an ARCRecord this is the length of the record as defined in the
 * header. For a WARCRecord this is the payload length, defined as the
 * difference between the total record length and the size of the header.
 *
 * @return the length of the record content in bytes.
 * @throws ArgumentNotValid if the record is neither an ARCRecord nor a WARCRecord
 */
@Override
public long getSize() {
    if (record instanceof ARCRecord) {
        return record.getHeader().getLength();
    }
    if (record instanceof WARCRecord) {
        // getLength() covers the whole WARC record; getContentBegin() is the
        // number of bytes taken by the record header, so the payload length
        // is the difference between the two.
        final long totalLength = record.getHeader().getLength();
        final long headerLength = record.getHeader().getContentBegin();
        return totalLength - headerLength;
    }
    throw new ArgumentNotValid("Unknown type of ArchiveRecord: " + record.getClass());
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:21,代码来源:ExtendedFTPRemoteFile.java
示例13: adaptWARCHTTPResponse
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Parses the HTTP status line and headers at the start of a WARC record and
 * records them on the given search result.
 * <p>
 * The HTTP status code is stored on the result, after which the record, the
 * parsed HTTP headers and the record header mimetype are handed to the
 * annotater.
 *
 * @param result the search result to fill in
 * @param rec the WARC record, positioned at the start of the HTTP message
 * @return the same {@code result} instance
 * @throws IOException if reading fails; a RecoverableIOException is thrown
 *         when no valid HTTP status line can be read
 */
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
        WARCRecord rec) throws IOException {
    ArchiveRecordHeader header = rec.getHeader();

    // The HTTP message has to be parsed here, the record does not do it.
    byte[] statusBytes = HttpParser.readRawLine(rec);
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        String rawStatus = (statusBytes == null) ? "(null)" : new String(statusBytes);
        throw new RecoverableIOException("Failed to read http status where one " +
                " was expected: " +
                rawStatus);
    }

    int statusLength = statusBytes.length - eolCharCount;
    String statusLine = EncodingUtil.getString(statusBytes, 0,
            statusLength, ARCConstants.DEFAULT_ENCODING);
    boolean isHttpStatus = (statusLine != null)
            && StatusLine.startsWithHTTP(statusLine);
    if (!isHttpStatus) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }

    StatusLine status = new StatusLine(statusLine);
    result.setHttpCode(String.valueOf(status.getStatusCode()));

    Header[] headers = HttpParser.parseHeaders(rec,
            ARCConstants.DEFAULT_ENCODING);
    annotater.annotateHTTPContent(result, rec, headers, header.getMimetype());
    return result;
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:32,代码来源:NetarchiveSuiteWARCRecordToSearchResultAdapter.java
示例14: testOneJob_ExceptionInProcess
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Verifies that exceptions thrown from processRecord are collected by the
 * batch job, one per failing record, while all records are still processed.
 * TODO Check more error conditions -- the exception handling is tricky!
 */
public void testOneJob_ExceptionInProcess() {
    WARCBatchJob job = new TestWARCBatchJob() {
        public void processRecord(WARCRecord record, OutputStream os) {
            super.processRecord(record, new ByteArrayOutputStream());
            // Start failing once the configured number of records has passed.
            boolean shouldFail = (processed - 1) >= RECORDS_PROCESSED_BEFORE_EXCEPTION;
            if (shouldFail) {
                throw new ArgumentNotValid(
                        "testOneJob_ExceptionInProcess");
            }
        }
    };
    job.processFile(WARC_FILE, new ByteArrayOutputStream());
    Exception[] collected = job.getExceptionArray();

    assertEquals("Should have gotten through all records",
            TOTAL_RECORDS, processed);

    final int expectedFailures = TOTAL_RECORDS
            - RECORDS_PROCESSED_BEFORE_EXCEPTION;
    if (collected.length != expectedFailures) {
        // Dump the exceptions before the assertion below fails.
        printExceptions(collected);
    }
    assertEquals("Exceptions list should have one entry per failing record",
            expectedFailures, collected.length);

    for (int index = 0; index < expectedFailures; index++) {
        Exception failure = collected[index];
        assertTrue("Exception should be of type ArgumentNotValid",
                failure instanceof ArgumentNotValid);
    }
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:32,代码来源:WARCBatchJobTester.java
示例15: processRecord
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Increments the processed counter and remembers the URL of the record.
 *
 * @param wr the WARC record being processed
 * @param os output stream; not used by this implementation
 */
public void processRecord(WARCRecord wr, OutputStream os) {
    processed++;
    ArchiveRecordBase wrapped = new HeritrixArchiveRecordWrapper(wr);
    lastSeenURL = wrapped.getHeader().getUrl();
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:12,代码来源:WARCBatchJobTester.java
示例16: getType
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Returns the record type: the value of the type header for WARC records,
 * and the fixed string "response" for every other record.
 */
private String getType(ArchiveRecord nativeRecord) {
    if (!(nativeRecord instanceof WARCRecord)) {
        return "response";
    }
    WARCRecord warc = (WARCRecord) nativeRecord;
    Object typeValue = warc.getHeader().getHeaderValue(HEADER_KEY_TYPE);
    return typeValue.toString();
}
开发者ID:shsdev,项目名称:archiventory,代码行数:9,代码来源:ArcRecordReader.java
示例17: getID
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Returns an identifier for the record: the header's record identifier for
 * ARC records, the value of the ID header for WARC records, and the
 * resource URL for anything else.
 */
private String getID(ArchiveRecord nativeRecord) {
    if (nativeRecord instanceof ARCRecord) {
        return nativeRecord.getHeader().getRecordIdentifier();
    }
    if (nativeRecord instanceof WARCRecord) {
        WARCRecord warc = (WARCRecord) nativeRecord;
        Object idValue = warc.getHeader().getHeaderValue(HEADER_KEY_ID);
        return idValue.toString();
    }
    return getResourceUrl(nativeRecord);
}
开发者ID:shsdev,项目名称:archiventory,代码行数:13,代码来源:ArcRecordReader.java
示例18: getHttpHeaders
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Returns the HTTP headers of the record.
 * <p>
 * ARC records supply their own headers. For WARC records that report
 * content headers, the headers are parsed from the record stream. In every
 * other case an empty array is returned.
 *
 * @throws IOException if parsing the headers from the record fails
 */
private Header[] getHttpHeaders(ArchiveRecord nativeRecord) throws IOException {
    if (nativeRecord instanceof ARCRecord) {
        return ((ARCRecord) nativeRecord).getHttpHeaders();
    }
    boolean warcWithContentHeaders = nativeRecord instanceof WARCRecord
            && ((WARCRecord) nativeRecord).hasContentHeaders();
    if (warcWithContentHeaders) {
        return HttpParser.parseHeaders(nativeRecord, DEFAULT_ENCODING);
    }
    return new Header[0];
}
开发者ID:shsdev,项目名称:archiventory,代码行数:13,代码来源:ArcRecordReader.java
示例19: getMimeType
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Determines the MIME type to report for a record.
 * <p>
 * For a WARC record that comes with parsed HTTP headers, the value of the
 * first Content-Type header is returned, i.e. the MIME type of the content
 * stream rather than the record's own type (which for response records
 * would be "application/http; msgtype=response"). In every other case (any
 * non-WARC record, a WARC record without HTTP headers, or one whose headers
 * contain no Content-Type) the MIME type from the record header is
 * returned.
 */
private String getMimeType(ArchiveRecord nativeRecord, Header[] headers) {
    boolean warcWithHttpHeaders = nativeRecord instanceof WARCRecord
            && headers != null
            && headers.length >= 1;
    if (warcWithHttpHeaders) {
        for (Header candidate : headers) {
            String lowerCasedName = candidate.getName().toLowerCase();
            if (lowerCasedName.equals("content-type")) {
                return candidate.getValue();
            }
        }
    }
    // Fall back to the MIME type stored in the record header.
    return nativeRecord.getHeader().getMimetype();
}
开发者ID:shsdev,项目名称:archiventory,代码行数:29,代码来源:ArcRecordReader.java
示例20: testParseHttpHeadersInWARC
import org.archive.io.warc.WARCRecord; //导入依赖的package包/类
/**
 * Builds a WARC response record in memory, wraps it in a
 * HeaderedArchiveRecord, skips the HTTP header and verifies that the body,
 * the parsed HTTP headers and the target URL are all read back correctly.
 *
 * @throws IOException if building or reading the record fails
 */
public void testParseHttpHeadersInWARC() throws IOException {
    // One explicit charset for every String/byte conversion, so the declared
    // Content-Length always matches the bytes that are written.
    final String charset = "UTF-8";
    final String url = "http://foo.maths.uq.edu.au/index.html";
    final byte[] httpHeaderBytes = HTTPHEADER.getBytes(charset);
    final byte[] bodyBytes = BODY.getBytes(charset);
    final String warcHeader = "WARC/0.12\r\n"
        + "MIME-Version: 1.0\r\n"
        + "WARC-Record-Type: response\r\n"
        + "WARC-Target-URI: " + url + "\r\n"
        + "WARC-Date: 2006-09-19T17:20:24Z\r\n"
        + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
        + "WARC-IP-Address: 80.150.6.184\r\n"
        + "Content-ID: <urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58>\r\n"
        + "Content-Type: application/http; msgtype=response\r\n"
        + "Content-Length: " + (httpHeaderBytes.length + bodyBytes.length) + "\r\n"
        + "\r\n";
    final String hdr = warcHeader + HTTPHEADER + BODY;
    WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes(charset)),
        "READER_IDENTIFIER", 0, false, true);
    HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
    har.skipHttpHeader();

    // A single read(byte[]) may legally return fewer bytes than requested,
    // so keep reading until the buffer is full or the stream ends.
    byte[] b = new byte[bodyBytes.length];
    int filled = 0;
    while (filled < b.length) {
        int count = har.read(b, filled, b.length - filled);
        if (count <= 0) {
            break;
        }
        filled += count;
    }
    assertEquals("record body was shorter than expected", b.length, filled);

    String bodyRead = new String(b, charset);
    assertEquals(BODY, bodyRead);
    assertHeaderCorrectlyParsed(har.getContentHeaders());
    // Expected value first, so a failure message reports the values correctly.
    assertEquals("failed to retrieve Url from metadata", url,
        har.getHeader().getUrl());
}
开发者ID:iipc,项目名称:webarchive-commons,代码行数:39,代码来源:HeaderedArchiveRecordTest.java
注:本文中的org.archive.io.warc.WARCRecord类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论