1   /* $Id: ExperimentalWARCWriterProcessor.java 4935 2007-02-23 00:27:24Z gojomo $
2    *
3    * Created on August 1st, 2006.
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.writer;
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.File;
27  import java.io.IOException;
28  import java.net.InetAddress;
29  import java.net.URI;
30  import java.net.URISyntaxException;
31  import java.net.UnknownHostException;
32  import java.text.ParseException;
33  import java.util.Collection;
34  import java.util.Date;
35  import java.util.HashMap;
36  import java.util.Map;
37  import java.util.concurrent.atomic.AtomicInteger;
38  import java.util.logging.Level;
39  import java.util.logging.Logger;
40  
41  import org.apache.commons.httpclient.Header;
42  import org.apache.commons.httpclient.HttpMethodBase;
43  import org.apache.commons.httpclient.HttpStatus;
44  import org.apache.commons.lang.StringUtils;
45  import org.archive.crawler.Heritrix;
46  import org.archive.crawler.datamodel.CoreAttributeConstants;
47  import org.archive.crawler.datamodel.CrawlURI;
48  import org.archive.crawler.datamodel.FetchStatusCodes;
49  import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
50  import org.archive.crawler.event.CrawlStatusListener;
51  import org.archive.crawler.extractor.Link;
52  import org.archive.crawler.framework.WriterPoolProcessor;
53  import org.archive.crawler.settings.SimpleType;
54  import org.archive.crawler.settings.Type;
55  import org.archive.io.ReplayInputStream;
56  import org.archive.io.WriterPoolMember;
57  import org.archive.io.WriterPoolSettings;
58  import org.archive.io.warc.WARCWriter;
59  import org.archive.io.warc.WARCConstants;
60  import org.archive.io.warc.WARCWriterPool;
61  import org.archive.uid.GeneratorFactory;
62  import org.archive.util.ArchiveUtils;
63  import org.archive.util.XmlUtils;
64  import org.archive.util.anvl.ANVLRecord;
65  import org.w3c.dom.Document;
66  
67  /***
68   * WARCWriterProcessor.
69   * Goes against the 0.18 version of the WARC specification (which
70   * is functionally identical to 0.17 except in the protocol 
71   * identifier string). 
72   * See http://archive-access.sourceforge.net/warc/
73   * 
74   * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
75   * (commons-httpclient?) or find something else.
76   * 
77   * @author stack
78   */
79  public class WARCWriterProcessor extends WriterPoolProcessor
80  implements CoreAttributeConstants, CrawlStatusListener,
81  WriterPoolSettings, FetchStatusCodes, WARCConstants {
82      private static final long serialVersionUID = 6182850087635847443L;
83  
84      private final Logger logger = Logger.getLogger(this.getClass().getName());
85      
86      /***
87       * Key for whether to write 'request' type records where possible
88       */
89      public static final String ATTR_WRITE_REQUESTS =
90          "write-requests";
91      
92      /***
93       * Key for whether to write 'metadata' type records where possible
94       */
95      public static final String ATTR_WRITE_METADATA =
96          "write-metadata";
97      
98      /***
99       * Key for whether to write 'revisit' type records when
100      * consecutive identical digest
101      */
102     public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
103         "write-revisit-for-identical-digests";
104     
105     /***
106      * Key for whether to write 'revisit' type records for server
107      * "304 not modified" responses
108      */
109     public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
110         "write-revisit-for-not-modified";
111     
112     /***
113      * Default path list.
114      */
115     private static final String [] DEFAULT_PATH = {"warcs"};
116 
117     protected String [] getDefaultPath() {
118         return DEFAULT_PATH;
119     }
120     
121     /***
122      * @param name Name of this writer.
123      */
124     public WARCWriterProcessor(final String name) {
125         super(name, "Experimental WARCWriter processor (Version 0.17)");
126         Type e = addElementToDefinition(
127                 new SimpleType(ATTR_WRITE_REQUESTS,
128                 "Whether to write 'request' type records. " +
129                 "Default is true.", new Boolean(true)));
130         e.setOverrideable(true);
131         e.setExpertSetting(true);
132         e = addElementToDefinition(
133                 new SimpleType(ATTR_WRITE_METADATA,
134                 "Whether to write 'metadata' type records. " +
135                 "Default is true.", new Boolean(true)));
136         e.setOverrideable(true);
137         e.setExpertSetting(true);
138         e = addElementToDefinition(
139                 new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
140                 "Whether to write 'revisit' type records when a URI's " +
141                 "history indicates the previous fetch had an identical " +
142                 "content digest. " +
143                 "Default is true.", new Boolean(true)));
144         e.setOverrideable(true);
145         e.setExpertSetting(true);
146         e = addElementToDefinition(
147                 new SimpleType(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
148                 "Whether to write 'revisit' type records when a " +
149                 "304-Not Modified response is received. " +
150                 "Default is true.", new Boolean(true)));
151         e.setOverrideable(true);
152         e.setExpertSetting(true);
153     }
154 
155     protected void setupPool(final AtomicInteger serialNo) {
156 		setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
157             getPoolMaximumWait()));
158     }
159     
160     /***
161      * Writes a CrawlURI and its associated data to store file.
162      * 
163      * Currently this method understands the following uri types: dns, http, and
164      * https.
165      * 
166      * @param curi CrawlURI to process.
167      * 
168      */
169     protected void innerProcess(CrawlURI curi) {
170         // If failure, or we haven't fetched the resource yet, return
171         if (curi.getFetchStatus() <= 0) {
172             return;
173         }
174         
175         // If no recorded content at all, don't write record.
176         long recordLength = curi.getContentSize();
177         if (recordLength <= 0) {
178             // getContentSize() should be > 0 if any material (even just
179             // HTTP headers with zero-length body) is available. 
180         	return;
181         }
182         
183         String scheme = curi.getUURI().getScheme().toLowerCase();
184         try {
185             if (shouldWrite(curi)) {
186                 write(scheme, curi);
187             } else {
188                 logger.info("This writer does not write out scheme " +
189                         scheme + " content");
190             }
191         } catch (IOException e) {
192             curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
193                 curi.toString());
194             logger.log(Level.SEVERE, "Failed write of Record: " +
195                 curi.toString(), e);
196         }
197     }
198     
199     protected void write(final String lowerCaseScheme, final CrawlURI curi)
200     throws IOException {
201         WriterPoolMember writer = getPool().borrowFile();
202         long position = writer.getPosition();
203         // See if we need to open a new file because we've exceeed maxBytes.
204         // Call to checkFileSize will open new file if we're at maximum for
205         // current file.
206         writer.checkSize();
207         if (writer.getPosition() != position) {
208             // We just closed the file because it was larger than maxBytes.
209             // Add to the totalBytesWritten the size of the first record
210             // in the file, if any.
211             setTotalBytesWritten(getTotalBytesWritten() +
212             	(writer.getPosition() - position));
213             position = writer.getPosition();
214         }
215         
216         WARCWriter w = (WARCWriter)writer;
217         try {
218             // Write a request, response, and metadata all in the one
219             // 'transaction'.
220             final URI baseid = getRecordID();
221             final String timestamp =
222                 ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
223             if (lowerCaseScheme.startsWith("http")) {
224                 // Add named fields for ip, checksum, and relate the metadata
225                 // and request to the resource field.
226                 // TODO: Use other than ANVL (or rename ANVL as NameValue or
227                 // use RFC822 (commons-httpclient?).
228                 ANVLRecord headers = new ANVLRecord(5);
229                 if (curi.getContentDigest() != null) {
230                     headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
231                         curi.getContentDigestSchemeString());
232                 }
233                 headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
234                 URI rid;
235                 
236                 if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) && 
237                         ((Boolean)getUncheckedAttribute(curi, 
238                                 ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
239                     rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
240                             baseid, curi, headers);
241                 } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && 
242                         ((Boolean)getUncheckedAttribute(curi, 
243                                 ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
244                     rid = writeRevisitNotModified(w, timestamp,
245                             baseid, curi, headers);
246                 } else {
247                     if (curi.isTruncatedFetch()) {
248                         String value = curi.isTimeTruncatedFetch()?
249                             NAMED_FIELD_TRUNCATED_VALUE_TIME:
250                             curi.isLengthTruncatedFetch()?
251                                 NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
252                                 curi.isHeaderTruncatedFetch()?
253                                     NAMED_FIELD_TRUNCATED_VALUE_HEAD:
254                             // TODO: Add this to spec.
255                             TRUNCATED_VALUE_UNSPECIFIED;
256                         headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
257                     }
258                     rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
259                     	baseid, curi, headers);
260                 }
261                 
262                 headers = new ANVLRecord(1);
263                 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
264                     '<' + rid.toString() + '>');
265 
266                 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
267                     writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
268                             baseid, curi, headers);
269                 }
270                 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
271                     writeMetadata(w, timestamp, baseid, curi, headers);
272                 } 
273             } else if (lowerCaseScheme.equals("dns")) {
274                 ANVLRecord headers = null;
275                 String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
276                 if (ip != null && ip.length() > 0) {
277                     headers = new ANVLRecord(1);
278                     headers.addLabelValue(HEADER_KEY_IP, ip);
279                 }
280                 writeResponse(w, timestamp, curi.getContentType(), baseid,
281                     curi, headers);
282             } else {
283                 logger.warning("No handler for scheme " + lowerCaseScheme);
284             }
285         } catch (IOException e) {
286             // Invalidate this file (It gets a '.invalid' suffix).
287             getPool().invalidateFile(writer);
288             // Set the writer to null otherwise the pool accounting
289             // of how many active writers gets skewed if we subsequently
290             // do a returnWriter call on this object in the finally block.
291             writer = null;
292             throw e;
293         } finally {
294             if (writer != null) {
295             	setTotalBytesWritten(getTotalBytesWritten() +
296             	     (writer.getPosition() - position));
297                 getPool().returnFile(writer);
298             }
299         }
300         checkBytesWritten();
301     }
302     
303     protected URI writeRequest(final WARCWriter w,
304             final String timestamp, final String mimetype,
305             final URI baseid, final CrawlURI curi,
306             final ANVLRecord namedFields) 
307     throws IOException {
308         final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
309         ReplayInputStream ris =
310             curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
311         try {
312             w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
313                 namedFields, ris,
314                 curi.getHttpRecorder().getRecordedOutput().getSize());
315         } finally {
316             if (ris != null) {
317                 ris.close();
318             }
319         }
320         return uid;
321     }
322     
323     protected URI writeResponse(final WARCWriter w,
324             final String timestamp, final String mimetype,
325             final URI baseid, final CrawlURI curi,
326             final ANVLRecord namedFields) 
327     throws IOException {
328         ReplayInputStream ris =
329             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
330         try {
331             w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
332                 namedFields, ris,
333                 curi.getHttpRecorder().getRecordedInput().getSize());
334         } finally {
335             if (ris != null) {
336                 ris.close();
337             }
338         }
339         return baseid;
340     }
341     
342     protected URI writeResource(final WARCWriter w,
343             final String timestamp, final String mimetype,
344             final URI baseid, final CrawlURI curi,
345             final ANVLRecord namedFields) 
346     throws IOException {
347         ReplayInputStream ris =
348             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
349         try {
350             w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
351                 namedFields, ris,
352                 curi.getHttpRecorder().getRecordedInput().getSize());
353         } finally {
354             if (ris != null) {
355                 ris.close();
356             }
357         }
358         return baseid;
359     }
360     
361     protected URI writeRevisitDigest(final WARCWriter w,
362             final String timestamp, final String mimetype,
363             final URI baseid, final CrawlURI curi,
364             final ANVLRecord namedFields) 
365     throws IOException {
366         long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
367         revisedLength = revisedLength > 0 
368             ? revisedLength 
369             : curi.getHttpRecorder().getRecordedInput().getSize();
370         namedFields.addLabelValue(
371         		HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
372         namedFields.addLabelValue(
373         		HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
374         ReplayInputStream ris =
375             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
376         try {
377             w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
378                 namedFields, ris, revisedLength);
379         } finally {
380             if (ris != null) {
381                 ris.close();
382             }
383         }
384         curi.addAnnotation("warcRevisit:digest"); 
385         return baseid;
386     }
387     
388     protected URI writeRevisitNotModified(final WARCWriter w,
389             final String timestamp, 
390             final URI baseid, final CrawlURI curi,
391             final ANVLRecord namedFields) 
392     throws IOException {
393         namedFields.addLabelValue(
394         		HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
395         // save just enough context to understand basis of not-modified
396         if(curi.containsKey(A_HTTP_TRANSACTION)) {
397             HttpMethodBase method = 
398                 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
399             saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);
400             saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,
401             		HEADER_KEY_LAST_MODIFIED);
402         }
403         // truncate to zero-length (all necessary info is above)
404         namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
405             NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
406         ReplayInputStream ris =
407             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
408         try {
409             w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
410                 namedFields, ris, 0);
411         } finally {
412             if (ris !=  null) {
413                 ris.close();
414             }
415         }
416         curi.addAnnotation("warcRevisit:notModified"); 
417         return baseid;
418     }
419     
420     /***
421      * Save a header from the given HTTP operation into the 
422      * provider headers under a new name
423      * 
424      * @param origName header name to get if present
425      * @param method http operation containing headers
426      */
427     protected void saveHeader(String origName, HttpMethodBase method, 
428     		ANVLRecord headers, String newName) {
429         Header header = method.getResponseHeader(origName);
430         if(header!=null) {
431             headers.addLabelValue(newName, header.getValue());
432         }
433     }
434 
435 	protected URI writeMetadata(final WARCWriter w,
436             final String timestamp,
437             final URI baseid, final CrawlURI curi,
438             final ANVLRecord namedFields) 
439     throws IOException {
440         final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
441         // Get some metadata from the curi.
442         // TODO: Get all curi metadata.
443         // TODO: Use other than ANVL (or rename ANVL as NameValue or use
444         // RFC822 (commons-httpclient?).
445         ANVLRecord r = new ANVLRecord();
446         if (curi.isSeed()) {
447             r.addLabel("seed");
448         } else {
449         	if (curi.forceFetch()) {
450         		r.addLabel("force-fetch");
451         	}
452             r.addLabelValue("via", curi.flattenVia());
453             r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
454             if (curi.containsKey(A_SOURCE_TAG)) {
455                 r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
456             }
457         }
458         long duration = curi.getFetchDuration();
459         if(duration>-1) {
460             r.addLabelValue("fetchTimeMs", Long.toString(duration));
461         }
462         
463         // Add outlinks though they are effectively useless without anchor text.
464         Collection<Link> links = curi.getOutLinks();
465         if (links != null && links.size() > 0) {
466             for (Link link: links) {
467                 r.addLabelValue("outlink", link.toString());
468             }
469         }
470         
471         // TODO: Other curi fields to write to metadata.
472         // 
473         // Credentials
474         // 
475         // fetch-began-time: 1154569278774
476         // fetch-completed-time: 1154569281816
477         //
478         // Annotations.
479         
480         byte [] b = r.getUTF8Bytes();
481         w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
482             uid, namedFields, new ByteArrayInputStream(b), b.length);
483         return uid;
484     }
485     
486     protected URI getRecordID() throws IOException {
487         URI result;
488         try {
489             result = GeneratorFactory.getFactory().getRecordID();
490         } catch (URISyntaxException e) {
491             throw new IOException(e.toString());
492         }
493         return result;
494     }
495     
496     protected URI qualifyRecordID(final URI base, final String key,
497             final String value)
498     throws IOException {
499         URI result;
500         Map<String, String> qualifiers = new HashMap<String, String>(1);
501         qualifiers.put(key, value);
502         try {
503             result = GeneratorFactory.getFactory().
504                 qualifyRecordID(base, qualifiers);
505         } catch (URISyntaxException e) {
506             throw new IOException(e.toString());
507         }
508         return result;
509     }  
510     
511     @Override
512     protected String getFirstrecordStylesheet() {
513         return "/warcinfobody.xsl";
514     }
515 
516     /***
517      * Return relevant values as header-like fields (here ANVLRecord, but 
518      * spec-defined "application/warc-fields" type when written). Field
519      * names from from DCMI Terms and the WARC/0.17 specification.
520      * 
521      * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
522      */
523     @Override
524     protected String getFirstrecordBody(File orderFile) {
525         ANVLRecord record = new ANVLRecord(7);
526         record.addLabelValue("software", "Heritrix/" +
527                 Heritrix.getVersion() + " http://crawler.archive.org");
528         try {
529             InetAddress host = InetAddress.getLocalHost();
530             record.addLabelValue("ip", host.getHostAddress());
531             record.addLabelValue("hostname", host.getHostName());
532         } catch (UnknownHostException e) {
533             logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
534         }
535         record.addLabelValue("format","WARC File Format 0.17");
536         record.addLabelValue("conformsTo","http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc");
537         // Get other values from order.xml 
538         try {
539             Document doc = XmlUtils.getDocument(orderFile);
540             addIfNotBlank(record,"operator",
541                     XmlUtils.xpathOrNull(doc,"//meta/operator"));
542             addIfNotBlank(record,"publisher",
543                     XmlUtils.xpathOrNull(doc,"//meta/organization"));
544             addIfNotBlank(record,"audience",
545                     XmlUtils.xpathOrNull(doc,"//meta/audience"));
546             addIfNotBlank(record,"isPartOf",
547                     XmlUtils.xpathOrNull(doc,"//meta/name"));
548             String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date");
549             if(StringUtils.isNotBlank(rawDate)) {
550                 Date date;
551                 try {
552                     date = ArchiveUtils.parse14DigitDate(rawDate);
553                     addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
554                 } catch (ParseException e) {
555                     logger.log(Level.WARNING,"obtaining warc created date",e);
556                 }
557             }
558             addIfNotBlank(record,"description",
559                     XmlUtils.xpathOrNull(doc,"//meta/description"));
560             addIfNotBlank(record,"robots",
561                     XmlUtils.xpathOrNull(doc, 
562                             "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
563             addIfNotBlank(record,"http-header-user-agent",
564                     XmlUtils.xpathOrNull(doc, 
565                             "//map[@name='http-headers']/string[@name='user-agent']"));
566             addIfNotBlank(record,"http-header-from",
567                     XmlUtils.xpathOrNull(doc, 
568                             "//map[@name='http-headers']/string[@name='from']"));
569         } catch (IOException e) {
570             logger.log(Level.WARNING,"obtaining warcinfo",e);
571         } 
572         // really ugly to return as string, when it may just be merged with 
573         // a couple other fields at write time, but changing would require 
574         // larger refactoring
575         return record.toString();
576     }
577 
578 
579     protected void addIfNotBlank(ANVLRecord record, String label, String value) {
580         if(StringUtils.isNotBlank(value)) {
581             record.addLabelValue(label, value);
582         }
583     }
584 }