1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.io.arc;
26
27 import java.io.ByteArrayInputStream;
28 import java.io.ByteArrayOutputStream;
29 import java.io.IOException;
30 import java.io.InputStream;
31
32 import org.apache.commons.httpclient.Header;
33 import org.apache.commons.httpclient.HttpParser;
34 import org.apache.commons.httpclient.StatusLine;
35 import org.apache.commons.httpclient.util.EncodingUtil;
36 import org.archive.io.ArchiveRecord;
37 import org.archive.io.ArchiveRecordHeader;
38 import org.archive.io.RecoverableIOException;
39
40
41 /***
42 * An ARC file record.
43 * Does not compass the ARCRecord metadata line, just the record content.
44 * @author stack
45 */
46 public class ARCRecord extends ArchiveRecord implements ARCConstants {
47 /***
48 * Http status line object.
49 *
50 * May be null if record is not http.
51 */
52 private StatusLine httpStatus = null;
53
54 /***
55 * Http header bytes.
56 *
57 * If non-null and bytes available, give out its contents before we
58 * go back to the underlying stream.
59 */
60 private InputStream httpHeaderStream = null;
61
62 /***
63 * Http headers.
64 *
65 * Only populated after reading of headers.
66 */
67 private Header [] httpHeaders = null;
68
69
70 /***
71 * Minimal http header length.
72 *
73 * I've seen in arcs content length of 1 with no
74 * header.
75 */
76 private static final long MIN_HTTP_HEADER_LENGTH =
77 "HTTP/1.1 200 OK\r\n".length();
78
79 /***
80 * Constructor.
81 *
82 * @param in Stream cue'd up to be at the start of the record this instance
83 * is to represent.
84 * @param metaData Meta data.
85 * @throws IOException
86 */
87 public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
88 throws IOException {
89 this(in, metaData, 0, true, false, true);
90 }
91
92 /***
93 * Constructor.
94 *
95 * @param in Stream cue'd up to be at the start of the record this instance
96 * is to represent.
97 * @param metaData Meta data.
98 * @param bodyOffset Offset into the body. Usually 0.
99 * @param digest True if we're to calculate digest for this record. Not
100 * digesting saves about ~15% of cpu during an ARC parse.
101 * @param strict Be strict parsing (Parsing stops if ARC inproperly
102 * formatted).
103 * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
104 * about ~20% of CPU during an ARC parse.
105 * @throws IOException
106 */
107 public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
108 int bodyOffset, boolean digest, boolean strict,
109 final boolean parseHttpHeaders)
110 throws IOException {
111 super(in, metaData, bodyOffset, digest, strict);
112 if (parseHttpHeaders) {
113 this.httpHeaderStream = readHttpHeader();
114 }
115 }
116
117 /***
118 * Skip over the the http header if one present.
119 *
120 * Subsequent reads will get the body.
121 *
122 * <p>Calling this method in the midst of reading the header
123 * will make for strange results. Otherwise, safe to call
124 * at any time though before reading any of the arc record
125 * content is only time that it makes sense.
126 *
127 * <p>After calling this method, you can call
128 * {@link #getHttpHeaders()} to get the read http header.
129 *
130 * @throws IOException
131 */
132 public void skipHttpHeader() throws IOException {
133 if (this.httpHeaderStream != null) {
134
135 for (int available = this.httpHeaderStream.available();
136 this.httpHeaderStream != null &&
137 (available = this.httpHeaderStream.available()) > 0;) {
138
139
140 byte [] buffer = new byte[available];
141
142
143 read(buffer, 0, available);
144 }
145 }
146 }
147
148 public void dumpHttpHeader() throws IOException {
149 if (this.httpHeaderStream == null) {
150 return;
151 }
152
153 for (int available = this.httpHeaderStream.available();
154 this.httpHeaderStream != null
155 && (available = this.httpHeaderStream.available()) > 0;) {
156
157
158 byte[] buffer = new byte[available];
159
160
161 int read = read(buffer, 0, available);
162 System.out.write(buffer, 0, read);
163 }
164 }
165
166 /***
167 * Read http header if present. Technique borrowed from HttpClient HttpParse
168 * class.
169 *
170 * @return ByteArrayInputStream with the http header in it or null if no
171 * http header.
172 * @throws IOException
173 */
174 private InputStream readHttpHeader() throws IOException {
175
176
177 if(!getHeader().getUrl().startsWith("http") ||
178 getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
179 return null;
180 }
181 byte [] statusBytes = HttpParser.readRawLine(getIn());
182 int eolCharCount = getEolCharsCount(statusBytes);
183 if (eolCharCount <= 0) {
184 throw new IOException(
185 "Failed to read http status where one was expected: "
186 + ((statusBytes == null) ? "" : new String(statusBytes)));
187 }
188 String statusLine = EncodingUtil.getString(statusBytes, 0,
189 statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
190 if ((statusLine == null) ||
191 !StatusLine.startsWithHTTP(statusLine)) {
192 if (statusLine.startsWith("DELETED")) {
193
194
195
196
197
198
199
200
201 throw new DeletedARCRecordIOException(statusLine);
202 } else {
203 throw new IOException("Failed parse of http status line.");
204 }
205 }
206 this.httpStatus = new StatusLine(statusLine);
207
208
209
210
211
212 ByteArrayOutputStream baos =
213 new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
214 baos.write(statusBytes);
215
216
217
218 for (byte [] lineBytes = null; true;) {
219 lineBytes = HttpParser.readRawLine(getIn());
220 eolCharCount = getEolCharsCount(lineBytes);
221 if (eolCharCount <= 0) {
222 throw new IOException("Failed reading http headers: " +
223 ((lineBytes != null)? new String(lineBytes): null));
224 }
225
226 baos.write(lineBytes);
227 if ((lineBytes.length - eolCharCount) <= 0) {
228
229 break;
230 }
231 }
232
233 byte [] headerBytes = baos.toByteArray();
234
235 this.getMetaData().setContentBegin(headerBytes.length);
236 ByteArrayInputStream bais =
237 new ByteArrayInputStream(headerBytes);
238 if (!bais.markSupported()) {
239 throw new IOException("ByteArrayInputStream does not support mark");
240 }
241 bais.mark(headerBytes.length);
242
243
244 bais.read(statusBytes, 0, statusBytes.length);
245 this.httpHeaders = HttpParser.parseHeaders(bais,
246 ARCConstants.DEFAULT_ENCODING);
247 this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
248 bais.reset();
249 return bais;
250 }
251
252 private static class DeletedARCRecordIOException
253 extends RecoverableIOException {
254 public DeletedARCRecordIOException(final String reason) {
255 super(reason);
256 }
257 }
258
259 /***
260 * Return status code for this record.
261 *
262 * This method will return -1 until the http header has been read.
263 * @return Status code.
264 */
265 public int getStatusCode() {
266 return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
267 }
268
269 /***
270 * @param bytes Array of bytes to examine for an EOL.
271 * @return Count of end-of-line characters or zero if none.
272 */
273 private int getEolCharsCount(byte [] bytes) {
274 int count = 0;
275 if (bytes != null && bytes.length >=1 &&
276 bytes[bytes.length - 1] == '\n') {
277 count++;
278 if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
279 count++;
280 }
281 }
282 return count;
283 }
284
285 /***
286 * @return Meta data for this record.
287 */
288 public ARCRecordMetaData getMetaData() {
289 return (ARCRecordMetaData)getHeader();
290 }
291
292 /***
293 * @return http headers (Only available after header has been read).
294 */
295 public Header [] getHttpHeaders() {
296 return this.httpHeaders;
297 }
298
299 /***
300 * @return Next character in this ARCRecord's content else -1 if at end of
301 * this record.
302 * @throws IOException
303 */
304 public int read() throws IOException {
305 int c = -1;
306 if (this.httpHeaderStream != null &&
307 (this.httpHeaderStream.available() > 0)) {
308
309
310 c = this.httpHeaderStream.read();
311
312 if (this.httpHeaderStream.available() <= 0) {
313 this.httpHeaderStream = null;
314 }
315 incrementPosition();
316 } else {
317 c = super.read();
318 }
319 return c;
320 }
321
322 public int read(byte [] b, int offset, int length) throws IOException {
323 int read = -1;
324 if (this.httpHeaderStream != null &&
325 (this.httpHeaderStream.available() > 0)) {
326
327
328 read = Math.min(length, this.httpHeaderStream.available());
329 if (read == 0) {
330 read = -1;
331 } else {
332 read = this.httpHeaderStream.read(b, offset, read);
333 }
334
335 if (this.httpHeaderStream.available() <= 0) {
336 this.httpHeaderStream = null;
337 }
338 incrementPosition(read);
339 } else {
340 read = super.read(b, offset, length);
341 }
342 return read;
343 }
344
345 /***
346 * @return Offset at which the body begins (Only known after
347 * header has been read) or -1 if none or if we haven't read
348 * headers yet. Usually length of HTTP headers (does not include ARC
349 * metadata line length).
350 */
351 public int getBodyOffset() {
352 return this.getMetaData().getContentBegin();
353 }
354
355 @Override
356 protected String getIp4Cdx(ArchiveRecordHeader h) {
357 String result = null;
358 if (h instanceof ARCRecordMetaData) {
359 result = ((ARCRecordMetaData)h).getIp();
360 }
361 return (result != null)? result: super.getIp4Cdx(h);
362 }
363
364 @Override
365 protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
366 String result = null;
367 if (h instanceof ARCRecordMetaData) {
368 result = ((ARCRecordMetaData) h).getStatusCode();
369 }
370 return (result != null) ? result: super.getStatusCode4Cdx(h);
371 }
372
373 @Override
374 protected String getDigest4Cdx(ArchiveRecordHeader h) {
375 String result = null;
376 if (h instanceof ARCRecordMetaData) {
377 result = ((ARCRecordMetaData) h).getDigest();
378 }
379 return (result != null) ? result: super.getDigest4Cdx(h);
380 }
381 }