package org.archive.crawler.writer;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.text.ParseException;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.lang.StringUtils;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.WriterPoolProcessor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPool;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.XmlUtils;
import org.archive.util.anvl.ANVLRecord;
import org.w3c.dom.Document;

/**
 * WARCWriterProcessor.
 * Writes crawled content as WARC records, targeting version 0.18 of the
 * WARC specification (functionally identical to 0.17 except for the
 * protocol identifier string).
 * See http://archive-access.sourceforge.net/warc/
 *
 * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
 * (commons-httpclient?) or find something else.
 *
 * @author stack
 */
public class WARCWriterProcessor extends WriterPoolProcessor
implements CoreAttributeConstants, CrawlStatusListener,
WriterPoolSettings, FetchStatusCodes, WARCConstants {
    private static final long serialVersionUID = 6182850087635847443L;

    private final Logger logger = Logger.getLogger(this.getClass().getName());

    /**
     * Key for whether to write 'request' type records where possible.
     */
    public static final String ATTR_WRITE_REQUESTS =
        "write-requests";

    /**
     * Key for whether to write 'metadata' type records where possible.
     */
    public static final String ATTR_WRITE_METADATA =
        "write-metadata";

    /**
     * Key for whether to write 'revisit' type records when consecutive
     * fetches yield an identical content digest.
     */
    public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
        "write-revisit-for-identical-digests";

    /**
     * Key for whether to write 'revisit' type records for server
     * "304 Not Modified" responses.
     */
    public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
        "write-revisit-for-not-modified";

    /**
     * Default path list.
     */
    private static final String [] DEFAULT_PATH = {"warcs"};

    protected String [] getDefaultPath() {
        return DEFAULT_PATH;
    }

    /**
     * @param name Name of this writer.
     */
    public WARCWriterProcessor(final String name) {
        super(name, "Experimental WARCWriter processor (Version 0.17)");
        Type e = addElementToDefinition(
            new SimpleType(ATTR_WRITE_REQUESTS,
                "Whether to write 'request' type records. " +
                "Default is true.", new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(
            new SimpleType(ATTR_WRITE_METADATA,
                "Whether to write 'metadata' type records. " +
                "Default is true.", new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(
            new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
                "Whether to write 'revisit' type records when a URI's " +
                "history indicates the previous fetch had an identical " +
                "content digest. " +
                "Default is true.", new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(
            new SimpleType(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
                "Whether to write 'revisit' type records when a " +
                "304-Not Modified response is received. " +
                "Default is true.", new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }
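
    /*
     * Illustrative only: the four toggles defined above are read per-URI at
     * write time (as write() below does), e.g.
     *
     *   boolean writeRequests =
     *       (Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS);
     *
     * so, like any other expert setting, they can be overridden per
     * settings scope.
     */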

    protected void setupPool(final AtomicInteger serialNo) {
        setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
            getPoolMaximumWait()));
    }

    /**
     * Writes a CrawlURI and its associated data to the store file.
     *
     * Currently this method understands the following URI schemes: dns,
     * http, and https.
     *
     * @param curi CrawlURI to process.
     */
    protected void innerProcess(CrawlURI curi) {
        // Skip URIs that got no response (fetch status <= 0 means the fetch
        // failed or was never attempted).
        if (curi.getFetchStatus() <= 0) {
            return;
        }

        // If nothing at all was recorded (not even headers), there is
        // nothing to write.
        long recordLength = curi.getContentSize();
        if (recordLength <= 0) {
            return;
        }

        String scheme = curi.getUURI().getScheme().toLowerCase();
        try {
            if (shouldWrite(curi)) {
                write(scheme, curi);
            } else {
                logger.info("This writer does not write out scheme " +
                    scheme + " content");
            }
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
                curi.toString());
            logger.log(Level.SEVERE, "Failed write of Record: " +
                curi.toString(), e);
        }
    }

    protected void write(final String lowerCaseScheme, final CrawlURI curi)
    throws IOException {
        WriterPoolMember writer = getPool().borrowFile();
        long position = writer.getPosition();
        // checkSize() may roll over to a new file; if the position moved,
        // account for the bytes used closing out the old file.
        writer.checkSize();
        if (writer.getPosition() != position) {
            setTotalBytesWritten(getTotalBytesWritten() +
                (writer.getPosition() - position));
            position = writer.getPosition();
        }

        WARCWriter w = (WARCWriter)writer;
        try {
            // One record ID per fetch: the response (or revisit) record keeps
            // it, and the request/metadata records reference it.
            final URI baseid = getRecordID();
            final String timestamp =
                ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
            if (lowerCaseScheme.startsWith("http")) {
                // Named fields for the response record: payload digest (if
                // any) and the IP address the content was fetched from.
                ANVLRecord headers = new ANVLRecord(5);
                if (curi.getContentDigest() != null) {
                    headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
                        curi.getContentDigestSchemeString());
                }
                headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
                URI rid;

                if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) &&
                        ((Boolean)getUncheckedAttribute(curi,
                            ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
                    rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
                        baseid, curi, headers);
                } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED &&
                        ((Boolean)getUncheckedAttribute(curi,
                            ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
                    rid = writeRevisitNotModified(w, timestamp,
                        baseid, curi, headers);
                } else {
                    if (curi.isTruncatedFetch()) {
                        String value = curi.isTimeTruncatedFetch()?
                            NAMED_FIELD_TRUNCATED_VALUE_TIME:
                            curi.isLengthTruncatedFetch()?
                                NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
                            curi.isHeaderTruncatedFetch()?
                                NAMED_FIELD_TRUNCATED_VALUE_HEAD:
                            TRUNCATED_VALUE_UNSPECIFIED;
                        headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
                    }
                    rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
                        baseid, curi, headers);
                }

                // Relate the request and metadata records to the record just
                // written via a concurrent-to named field.
                headers = new ANVLRecord(1);
                headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
                    '<' + rid.toString() + '>');

                if (((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
                    writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
                        baseid, curi, headers);
                }
                if (((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
                    writeMetadata(w, timestamp, baseid, curi, headers);
                }
            } else if (lowerCaseScheme.equals("dns")) {
                ANVLRecord headers = null;
                String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
                if (ip != null && ip.length() > 0) {
                    headers = new ANVLRecord(1);
                    headers.addLabelValue(HEADER_KEY_IP, ip);
                }
                writeResponse(w, timestamp, curi.getContentType(), baseid,
                    curi, headers);
            } else {
                logger.warning("No handler for scheme " + lowerCaseScheme);
            }
        } catch (IOException e) {
            // The writer is likely unusable; drop it from the pool instead of
            // returning it, and null it so the finally block skips it.
            getPool().invalidateFile(writer);
            writer = null;
            throw e;
        } finally {
            if (writer != null) {
                setTotalBytesWritten(getTotalBytesWritten() +
                    (writer.getPosition() - position));
                getPool().returnFile(writer);
            }
        }
        checkBytesWritten();
    }
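
    /*
     * Illustrative sketch only (IDs and header names below are examples; the
     * actual names come from WARCConstants): for one ordinary HTTP fetch with
     * default settings, write() above produces something like
     *
     *   response  WARC-Record-ID <urn:uuid:aaaa...>
     *             (plus WARC-IP-Address and WARC-Payload-Digest named fields)
     *   request   WARC-Concurrent-To <urn:uuid:aaaa...>
     *   metadata  WARC-Concurrent-To <urn:uuid:aaaa...>
     *
     * with the response replaced by a revisit record for identical-digest or
     * 304-Not Modified fetches.
     */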

    protected URI writeRequest(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
        ReplayInputStream ris =
            curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
        try {
            w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
                namedFields, ris,
                curi.getHttpRecorder().getRecordedOutput().getSize());
        } finally {
            if (ris != null) {
                ris.close();
            }
        }
        return uid;
    }

    protected URI writeResponse(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        ReplayInputStream ris =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
                namedFields, ris,
                curi.getHttpRecorder().getRecordedInput().getSize());
        } finally {
            if (ris != null) {
                ris.close();
            }
        }
        return baseid;
    }

    protected URI writeResource(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        ReplayInputStream ris =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
                namedFields, ris,
                curi.getHttpRecorder().getRecordedInput().getSize());
        } finally {
            if (ris != null) {
                ris.close();
            }
        }
        return baseid;
    }

    protected URI writeRevisitDigest(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        // Write only up to where the content begins (i.e. just the response
        // headers), dropping the duplicate payload; fall back to the full
        // recorded size when the content start is unknown.
        long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
        revisedLength = revisedLength > 0
            ? revisedLength
            : curi.getHttpRecorder().getRecordedInput().getSize();
        namedFields.addLabelValue(
            HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
        namedFields.addLabelValue(
            HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
        ReplayInputStream ris =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
                namedFields, ris, revisedLength);
        } finally {
            if (ris != null) {
                ris.close();
            }
        }
        curi.addAnnotation("warcRevisit:digest");
        return baseid;
    }
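
    /*
     * Illustrative named fields on an identical-digest revisit record
     * (values abbreviated; the actual header names and profile URI come
     * from WARCConstants):
     *
     *   WARC-Profile:   ...revisit/identical-content-digest...
     *   WARC-Truncated: length
     *
     * plus the IP-address / payload-digest fields added by the caller,
     * write().
     */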

    protected URI writeRevisitNotModified(final WARCWriter w,
            final String timestamp,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        namedFields.addLabelValue(
            HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
        // Carry over the validators behind the 304: the ETag and
        // Last-Modified response headers, if present.
        if (curi.containsKey(A_HTTP_TRANSACTION)) {
            HttpMethodBase method =
                (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
            saveHeader(A_ETAG_HEADER, method, namedFields, HEADER_KEY_ETAG);
            saveHeader(A_LAST_MODIFIED_HEADER, method, namedFields,
                HEADER_KEY_LAST_MODIFIED);
        }
        namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
            NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
        ReplayInputStream ris =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
                namedFields, ris, 0);
        } finally {
            if (ris != null) {
                ris.close();
            }
        }
        curi.addAnnotation("warcRevisit:notModified");
        return baseid;
    }

    /**
     * Save a header from the given HTTP operation into the
     * provided headers under a new name.
     *
     * @param origName header name to get if present
     * @param method http operation containing headers
     * @param headers ANVL record to add the header value to
     * @param newName name to store the header value under
     */
    protected void saveHeader(String origName, HttpMethodBase method,
            ANVLRecord headers, String newName) {
        Header header = method.getResponseHeader(origName);
        if (header != null) {
            headers.addLabelValue(newName, header.getValue());
        }
    }

    protected URI writeMetadata(final WARCWriter w,
            final String timestamp,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
        // Assemble the per-URI crawl metadata (discovery path, timing,
        // outlinks) as an ANVL record; it becomes the record body.
        ANVLRecord r = new ANVLRecord();
        if (curi.isSeed()) {
            r.addLabel("seed");
        } else {
            if (curi.forceFetch()) {
                r.addLabel("force-fetch");
            }
            r.addLabelValue("via", curi.flattenVia());
            r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
            if (curi.containsKey(A_SOURCE_TAG)) {
                r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
            }
        }
        long duration = curi.getFetchDuration();
        if (duration > -1) {
            r.addLabelValue("fetchTimeMs", Long.toString(duration));
        }

        Collection<Link> links = curi.getOutLinks();
        if (links != null && links.size() > 0) {
            for (Link link: links) {
                r.addLabelValue("outlink", link.toString());
            }
        }

        byte [] b = r.getUTF8Bytes();
        w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
            uid, namedFields, new ByteArrayInputStream(b), b.length);
        return uid;
    }
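
    /*
     * Illustrative ANVL body of a metadata record built above (all values
     * are made up):
     *
     *   via: http://example.com/
     *   hopsFromSeed: LE
     *   fetchTimeMs: 312
     *   outlink: http://example.com/page2.html
     */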

    protected URI getRecordID() throws IOException {
        URI result;
        try {
            result = GeneratorFactory.getFactory().getRecordID();
        } catch (URISyntaxException e) {
            throw new IOException(e.toString());
        }
        return result;
    }

    protected URI qualifyRecordID(final URI base, final String key,
            final String value)
    throws IOException {
        URI result;
        Map<String, String> qualifiers = new HashMap<String, String>(1);
        qualifiers.put(key, value);
        try {
            result = GeneratorFactory.getFactory().
                qualifyRecordID(base, qualifiers);
        } catch (URISyntaxException e) {
            throw new IOException(e.toString());
        }
        return result;
    }
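
    /*
     * Descriptive note, not normative: the response record keeps the
     * unqualified baseid while request and metadata records get a variant of
     * the same ID carrying a type=request / type=metadata qualifier (exactly
     * how the qualifier is encoded is up to GeneratorFactory), so all records
     * for a single fetch stay correlatable.
     */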

    @Override
    protected String getFirstrecordStylesheet() {
        return "/warcinfobody.xsl";
    }

    /**
     * Return relevant values as header-like fields (here ANVLRecord, but
     * spec-defined "application/warc-fields" type when written). Field
     * names are from DCMI Terms and the WARC/0.17 specification.
     *
     * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
     */
    @Override
    protected String getFirstrecordBody(File orderFile) {
        ANVLRecord record = new ANVLRecord(7);
        record.addLabelValue("software", "Heritrix/" +
            Heritrix.getVersion() + " http://crawler.archive.org");
        try {
            InetAddress host = InetAddress.getLocalHost();
            record.addLabelValue("ip", host.getHostAddress());
            record.addLabelValue("hostname", host.getHostName());
        } catch (UnknownHostException e) {
            logger.log(Level.WARNING, "unable to obtain local crawl engine host", e);
        }
        record.addLabelValue("format", "WARC File Format 0.17");
        record.addLabelValue("conformsTo",
            "http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc");

        // Pull descriptive metadata out of the crawl order file, if present.
        try {
            Document doc = XmlUtils.getDocument(orderFile);
            addIfNotBlank(record, "operator",
                XmlUtils.xpathOrNull(doc, "//meta/operator"));
            addIfNotBlank(record, "publisher",
                XmlUtils.xpathOrNull(doc, "//meta/organization"));
            addIfNotBlank(record, "audience",
                XmlUtils.xpathOrNull(doc, "//meta/audience"));
            addIfNotBlank(record, "isPartOf",
                XmlUtils.xpathOrNull(doc, "//meta/name"));
            String rawDate = XmlUtils.xpathOrNull(doc, "//meta/date");
            if (StringUtils.isNotBlank(rawDate)) {
                Date date;
                try {
                    date = ArchiveUtils.parse14DigitDate(rawDate);
                    addIfNotBlank(record, "created", ArchiveUtils.getLog14Date(date));
                } catch (ParseException e) {
                    logger.log(Level.WARNING, "obtaining warc created date", e);
                }
            }
            addIfNotBlank(record, "description",
                XmlUtils.xpathOrNull(doc, "//meta/description"));
            addIfNotBlank(record, "robots",
                XmlUtils.xpathOrNull(doc,
                    "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
            addIfNotBlank(record, "http-header-user-agent",
                XmlUtils.xpathOrNull(doc,
                    "//map[@name='http-headers']/string[@name='user-agent']"));
            addIfNotBlank(record, "http-header-from",
                XmlUtils.xpathOrNull(doc,
                    "//map[@name='http-headers']/string[@name='from']"));
        } catch (IOException e) {
            logger.log(Level.WARNING, "obtaining warcinfo", e);
        }

        return record.toString();
    }
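
    /*
     * Illustrative warcinfo body produced by getFirstrecordBody() (all
     * values are examples only; actual values depend on the installation
     * and crawl order):
     *
     *   software: Heritrix/1.14.0 http://crawler.archive.org
     *   ip: 192.0.2.10
     *   hostname: crawler.example.org
     *   format: WARC File Format 0.17
     *   conformsTo: http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc
     *   operator: Jane Operator
     *   isPartOf: example-crawl
     */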

    protected void addIfNotBlank(ANVLRecord record, String label, String value) {
        if (StringUtils.isNotBlank(value)) {
            record.addLabelValue(label, value);
        }
    }
}