View Javadoc

1   /* ARCRecord
2    *
3    * $Id: ARCRecord.java 5943 2008-08-01 23:01:27Z gojomo $
4    *
5    * Created on Jan 7, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.ByteArrayInputStream;
28  import java.io.ByteArrayOutputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  
32  import org.apache.commons.httpclient.Header;
33  import org.apache.commons.httpclient.HttpParser;
34  import org.apache.commons.httpclient.StatusLine;
35  import org.apache.commons.httpclient.util.EncodingUtil;
36  import org.archive.io.ArchiveRecord;
37  import org.archive.io.ArchiveRecordHeader;
38  import org.archive.io.RecoverableIOException;
39  
40  
41  /***
42   * An ARC file record.
43   * Does not compass the ARCRecord metadata line, just the record content.
44   * @author stack
45   */
46  public class ARCRecord extends ArchiveRecord implements ARCConstants {
47      /***
48       * Http status line object.
49       * 
50       * May be null if record is not http.
51       */
52      private StatusLine httpStatus = null;
53  
54      /***
55       * Http header bytes.
56       * 
57       * If non-null and bytes available, give out its contents before we
58       * go back to the underlying stream.
59       */
60      private InputStream httpHeaderStream = null;
61      
62      /***
63       * Http headers.
64       * 
65       * Only populated after reading of headers.
66       */
67      private Header [] httpHeaders = null;
68  
69      
70      /***
71       * Minimal http header length.
72       * 
73       * I've seen in arcs content length of 1 with no 
74       * header.
75       */
76      private static final long MIN_HTTP_HEADER_LENGTH =
77          "HTTP/1.1 200 OK\r\n".length();
78      
79      /***
80       * Constructor.
81       *
82       * @param in Stream cue'd up to be at the start of the record this instance
83       * is to represent.
84       * @param metaData Meta data.
85       * @throws IOException
86       */
87      public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
88      		throws IOException {
89          this(in, metaData, 0, true, false, true);
90      }
91  
92      /***
93       * Constructor.
94       *
95       * @param in Stream cue'd up to be at the start of the record this instance
96       * is to represent.
97       * @param metaData Meta data.
98       * @param bodyOffset Offset into the body.  Usually 0.
99       * @param digest True if we're to calculate digest for this record.  Not
100      * digesting saves about ~15% of cpu during an ARC parse.
101      * @param strict Be strict parsing (Parsing stops if ARC inproperly
102      * formatted).
103      * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
104      * about ~20% of CPU during an ARC parse.
105      * @throws IOException
106      */
107     public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
108         int bodyOffset, boolean digest, boolean strict,
109         final boolean parseHttpHeaders) 
110     throws IOException {
111     	super(in, metaData, bodyOffset, digest, strict);
112         if (parseHttpHeaders) {
113             this.httpHeaderStream = readHttpHeader();
114         }
115     }
116     
117     /***
118      * Skip over the the http header if one present.
119      * 
120      * Subsequent reads will get the body.
121      * 
122      * <p>Calling this method in the midst of reading the header
123      * will make for strange results.  Otherwise, safe to call
124      * at any time though before reading any of the arc record
125      * content is only time that it makes sense.
126      * 
127      * <p>After calling this method, you can call
128      * {@link #getHttpHeaders()} to get the read http header.
129      * 
130      * @throws IOException
131      */
132     public void skipHttpHeader() throws IOException {
133         if (this.httpHeaderStream != null) {
134             // Empty the httpHeaderStream
135             for (int available = this.httpHeaderStream.available();
136             		this.httpHeaderStream != null &&
137             			(available = this.httpHeaderStream.available()) > 0;) {
138                 // We should be in this loop once only we should only do this
139                 // buffer allocation once.
140                 byte [] buffer = new byte[available];
141                 // The read nulls out httpHeaderStream when done with it so
142                 // need check for null in the loop control line.
143                 read(buffer, 0, available);
144             }
145         }
146     }
147     
148     public void dumpHttpHeader() throws IOException {
149 		if (this.httpHeaderStream == null) {
150 			return;
151 		}
152 		// Dump the httpHeaderStream to STDOUT
153 		for (int available = this.httpHeaderStream.available();
154 			this.httpHeaderStream != null
155 				&& (available = this.httpHeaderStream.available()) > 0;) {
156 			// We should be in this loop only once and should do this
157 			// buffer allocation once.
158 			byte[] buffer = new byte[available];
159 			// The read nulls out httpHeaderStream when done with it so
160 			// need check for null in the loop control line.
161 			int read = read(buffer, 0, available);
162 			System.out.write(buffer, 0, read);
163 		}
164 	}
165     
166     /***
167 	 * Read http header if present. Technique borrowed from HttpClient HttpParse
168 	 * class.
169 	 * 
170 	 * @return ByteArrayInputStream with the http header in it or null if no
171 	 *         http header.
172 	 * @throws IOException
173 	 */
174     private InputStream readHttpHeader() throws IOException {
175         // If judged a record that doesn't have an http header, return
176         // immediately.
177         if(!getHeader().getUrl().startsWith("http") ||
178             getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
179             return null;
180         }
181         byte [] statusBytes = HttpParser.readRawLine(getIn());
182         int eolCharCount = getEolCharsCount(statusBytes);
183         if (eolCharCount <= 0) {
184             throw new IOException(
185                 "Failed to read http status where one was expected: " 
186                 + ((statusBytes == null) ? "" : new String(statusBytes)));
187         }
188         String statusLine = EncodingUtil.getString(statusBytes, 0,
189             statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
190         if ((statusLine == null) ||
191                 !StatusLine.startsWithHTTP(statusLine)) {
192             if (statusLine.startsWith("DELETED")) {
193                 // Some old ARCs have deleted records like following:
194                 // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
195                 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
196                 // (follows ~29K spaces)
197                 // For now, throw a RecoverableIOException so if iterating over
198                 // records, we keep going.  TODO: Later make a legitimate
199                 // ARCRecord from the deleted record rather than throw
200                 // exception.
201                 throw new DeletedARCRecordIOException(statusLine);
202             } else {
203                 throw new IOException("Failed parse of http status line.");
204             }
205         }
206         this.httpStatus = new StatusLine(statusLine);
207         
208         // Save off all bytes read.  Keep them as bytes rather than
209         // convert to strings so we don't have to worry about encodings
210         // though this should never be a problem doing http headers since
211         // its all supposed to be ascii.
212         ByteArrayOutputStream baos =
213             new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
214         baos.write(statusBytes);
215         
216         // Now read rest of the header lines looking for the separation
217         // between header and body.
218         for (byte [] lineBytes = null; true;) {
219             lineBytes = HttpParser.readRawLine(getIn());
220             eolCharCount = getEolCharsCount(lineBytes);
221             if (eolCharCount <= 0) {
222                 throw new IOException("Failed reading http headers: " +
223                     ((lineBytes != null)? new String(lineBytes): null));
224             }
225             // Save the bytes read.
226             baos.write(lineBytes);
227             if ((lineBytes.length - eolCharCount) <= 0) {
228                 // We've finished reading the http header.
229                 break;
230             }
231         }
232         
233         byte [] headerBytes = baos.toByteArray();
234         // Save off where body starts.
235         this.getMetaData().setContentBegin(headerBytes.length);
236         ByteArrayInputStream bais =
237             new ByteArrayInputStream(headerBytes);
238         if (!bais.markSupported()) {
239             throw new IOException("ByteArrayInputStream does not support mark");
240         }
241         bais.mark(headerBytes.length);
242         // Read the status line.  Don't let it into the parseHeaders function.
243         // It doesn't know what to do with it.
244         bais.read(statusBytes, 0, statusBytes.length);
245         this.httpHeaders = HttpParser.parseHeaders(bais,
246             ARCConstants.DEFAULT_ENCODING);
247         this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
248         bais.reset();
249         return bais;
250     }
251     
252     private static class DeletedARCRecordIOException
253     extends RecoverableIOException {
254         public DeletedARCRecordIOException(final String reason) {
255             super(reason);
256         }
257     }
258     
259     /***
260      * Return status code for this record.
261      * 
262      * This method will return -1 until the http header has been read.
263      * @return Status code.
264      */
265     public int getStatusCode() {
266         return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
267     }
268     
269     /***
270      * @param bytes Array of bytes to examine for an EOL.
271      * @return Count of end-of-line characters or zero if none.
272      */
273     private int getEolCharsCount(byte [] bytes) {
274         int count = 0;
275         if (bytes != null && bytes.length >=1 &&
276                 bytes[bytes.length - 1] == '\n') {
277             count++;
278             if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
279                 count++;
280             }
281         }
282         return count;
283     }
284 
285     /***
286      * @return Meta data for this record.
287      */
288     public ARCRecordMetaData getMetaData() {
289         return (ARCRecordMetaData)getHeader();
290     }
291     
292     /***
293      * @return http headers (Only available after header has been read).
294      */
295     public Header [] getHttpHeaders() {
296         return this.httpHeaders;
297     }
298 
299     /***
300      * @return Next character in this ARCRecord's content else -1 if at end of
301      * this record.
302      * @throws IOException
303      */
304     public int read() throws IOException {
305         int c = -1;
306         if (this.httpHeaderStream != null &&
307                 (this.httpHeaderStream.available() > 0)) {
308             // If http header, return bytes from it before we go to underlying
309             // stream.
310             c = this.httpHeaderStream.read();
311             // If done with the header stream, null it out.
312             if (this.httpHeaderStream.available() <= 0) {
313                 this.httpHeaderStream = null;
314             }
315             incrementPosition();
316         } else {
317             c = super.read();
318         }
319         return c;
320     }
321 
322     public int read(byte [] b, int offset, int length) throws IOException {
323         int read = -1;
324         if (this.httpHeaderStream != null &&
325                 (this.httpHeaderStream.available() > 0)) {
326             // If http header, return bytes from it before we go to underlying
327             // stream.
328             read = Math.min(length, this.httpHeaderStream.available());
329             if (read == 0) {
330                 read = -1;
331             } else {
332                 read = this.httpHeaderStream.read(b, offset, read);
333             }
334             // If done with the header stream, null it out.
335             if (this.httpHeaderStream.available() <= 0) {
336                 this.httpHeaderStream = null;
337             }
338             incrementPosition(read);
339         } else {
340             read = super.read(b, offset, length);
341         }
342         return read;
343     }
344 
345     /***
346      * @return Offset at which the body begins (Only known after
347      * header has been read) or -1 if none or if we haven't read
348      * headers yet.  Usually length of HTTP headers (does not include ARC
349      * metadata line length).
350      */
351     public int getBodyOffset() {
352         return this.getMetaData().getContentBegin();
353     }
354     
355     @Override
356     protected String getIp4Cdx(ArchiveRecordHeader h) {
357     	String result = null;
358     	if (h instanceof ARCRecordMetaData) {
359     		result = ((ARCRecordMetaData)h).getIp();
360     	}
361     	return (result != null)? result: super.getIp4Cdx(h);
362     }
363     
364     @Override
365 	protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
366 		String result = null;
367 		if (h instanceof ARCRecordMetaData) {
368 			result = ((ARCRecordMetaData) h).getStatusCode();
369 		}
370 		return (result != null) ? result: super.getStatusCode4Cdx(h);
371 	}
372     
373     @Override
374 	protected String getDigest4Cdx(ArchiveRecordHeader h) {
375 		String result = null;
376 		if (h instanceof ARCRecordMetaData) {
377 			result = ((ARCRecordMetaData) h).getDigest();
378 		}
379 		return (result != null) ? result: super.getDigest4Cdx(h);
380 	}
381 }