1   /* ByteReplayCharSequenceFactory
2    *
3    * (Re)Created on Dec 21, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.IOException;
26  import java.io.RandomAccessFile;
27  import java.io.UnsupportedEncodingException;
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import org.archive.util.DevUtils;
32  
33  /***
34   * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
35   * buffer and overflow backing file).
36   *
37   * Assumes the byte stream is ISO-8859-1 text, taking advantage of the fact 
38   * that each byte in the stream corresponds to a single unicode character with
39   * the same numerical value as the byte. 
40   *
41   * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
42   * from disk in memory; as long as the 'random access' of a CharSequence
43   * user stays within this window, access should remain fairly efficient.
44   * (So design any regexps pointed at these CharSequences to work within
45   * that range!)
46   *
47   * <p>When rereading of a location is necessary, the whole window is
48   * recentered around the location requested. (TODO: More research
49   * into whether this is the best strategy.)
50   *
51   * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
52   * to wrap the passed prefix buffer and the second, a memory-mapped
53   * ByteBuffer view into the backing file -- was consistently slower: ~10%.
54   * My tests did the following. Made a buffer filled w/ regular content.
55   * This buffer was used as the prefix buffer.  The buffer content was
56   * written MULTIPLER times to a backing file.  I then did accesses w/ the
57   * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
58   * read forward from byte 16-32.  Repeat.  Though I varied the size of the
59   * buffer to the size of the backing file,from 3-10, the difference of 10%
60   * or so seemed to persist.  Same if I tried to favor get() over get(index).
61   * I used a profiler, JMP, to study times taken (St.Ack did above comment).
62   *
63   * <p>TODO determine if memory mapped files are a better way to do this;
64   * probably not -- they don't offer the level of control over
65   * total memory used that this approach does.
66   *
67   * @author Gordon Mohr
68   * @version $Revision: 5849 $, $Date: 2008-06-28 01:29:33 +0000 (Sat, 28 Jun 2008) $
69   */
70  class Latin1ByteReplayCharSequence implements ReplayCharSequence {
71  
72      protected static Logger logger =
73          Logger.getLogger(Latin1ByteReplayCharSequence.class.getName());
74  
75      /***
76       * Buffer that holds the first bit of content.
77       *
78       * Once this is exhausted we go to the backing file.
79       */
80      private byte[] prefixBuffer;
81  
82      /***
83       * Total length of character stream to replay minus the HTTP headers
84       * if present.
85       *
86       * Used to find EOS.
87       */
88      protected int length;
89  
90      /***
91       * Absolute length of the stream.
92       *
93       * Includes HTTP headers.  Needed doing calc. in the below figuring
94       * how much to load into buffer.
95       */
96      private int absoluteLength = -1;
97  
98      /***
99       * Buffer window on to backing file.
100      */
101     private byte[] wraparoundBuffer;
102 
103     /***
104      * Absolute index into underlying bytestream where wrap starts.
105      */
106     private int wrapOrigin;
107 
108     /***
109      * Index in wraparoundBuffer that corresponds to wrapOrigin
110      */
111     private int wrapOffset;
112 
113     /***
114      * Name of backing file we go to when we've exhausted content from the
115      * prefix buffer.
116      */
117     private String backingFilename;
118 
119     /***
120      * Random access to the backing file.
121      */
122     private RandomAccessFile raFile;
123 
124     /***
125      * Offset into prefix buffer at which content beings.
126      */
127     private int contentOffset;
128 
129     /***
130      * 8-bit encoding used reading single bytes from buffer and
131      * stream.
132      */
133     private static final String DEFAULT_SINGLE_BYTE_ENCODING =
134         "ISO-8859-1";
135 
136 
137     /***
138      * Constructor.
139      *
140      * @param buffer In-memory buffer of recordings prefix.  We read from
141      * here first and will only go to the backing file if <code>size</code>
142      * requested is greater than <code>buffer.length</code>.
143      * @param size Total size of stream to replay in bytes.  Used to find
144      * EOS. This is total length of content including HTTP headers if
145      * present.
146      * @param responseBodyStart Where the response body starts in bytes.
147      * Used to skip over the HTTP headers if present.
148      * @param backingFilename Path to backing file with content in excess of
149      * whats in <code>buffer</code>.
150      *
151      * @throws IOException
152      */
153     public Latin1ByteReplayCharSequence(byte[] buffer, long size,
154             long responseBodyStart, String backingFilename)
155         throws IOException {
156 
157         this.length = (int)(size - responseBodyStart);
158         this.absoluteLength = (int)size;
159         this.prefixBuffer = buffer;
160         this.contentOffset = (int)responseBodyStart;
161 
162         // If amount to read is > than what is in our prefix buffer, then
163         // open the backing file.
164         if (size > buffer.length) {
165             this.backingFilename = backingFilename;
166             this.raFile = new RandomAccessFile(backingFilename, "r");
167             this.wraparoundBuffer = new byte[this.prefixBuffer.length];
168             this.wrapOrigin = this.prefixBuffer.length;
169             this.wrapOffset = 0;
170             loadBuffer();
171         }
172     }
173 
174     /***
175      * @return Length of characters in stream to replay.  Starts counting
176      * at the HTTP header/body boundary.
177      */
178     public int length() {
179         return this.length;
180     }
181 
182     /***
183      * Get character at passed absolute position.
184      *
185      * Called by {@link #charAt(int)} which has a relative index into the
186      * content, one that doesn't account for HTTP header if present.
187      *
188      * @param index Index into content adjusted to accomodate initial offset
189      * to get us past the HTTP header if present (i.e.
190      * {@link #contentOffset}).
191      *
192      * @return Characater at offset <code>index</code>.
193      */
194     public char charAt(int index) {
195         int c = -1;
196         // Add to index start-of-content offset to get us over HTTP header
197         // if present.
198         index += this.contentOffset;
199         if (index < this.prefixBuffer.length) {
200             // If index is into our prefix buffer.
201             c = this.prefixBuffer[index];
202         } else if (index >= this.wrapOrigin &&
203             (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
204             // If index is into our buffer window on underlying backing file.
205             c = this.wraparoundBuffer[
206                     ((index - this.wrapOrigin) + this.wrapOffset) %
207                         this.wraparoundBuffer.length];
208         } else {
209             // Index is outside of both prefix buffer and our buffer window
210             // onto the underlying backing file.  Fix the buffer window
211             // location.
212             c = faultCharAt(index);
213         }
214         // Stream is treated as single byte.  Make sure characters returned
215         // are not negative.
216         return (char)(c & 0xff);
217     }
218 
219     /***
220      * Get a character that's outside the current buffers.
221      *
222      * will cause the wraparoundBuffer to be changed to
223      * cover a region including the index
224      *
225      * if index is higher than the highest index in the
226      * wraparound buffer, buffer is moved forward such
227      * that requested char is last item in buffer
228      *
229      * if index is lower than lowest index in the
230      * wraparound buffer, buffet is reset centered around
231      * index
232      *
233      * @param index Index of character to fetch.
234      * @return A character that's outside the current buffers
235      */
236     private int faultCharAt(int index) {
237         if(Thread.interrupted()) {
238             throw new RuntimeException("thread interrupted");
239         }
240         if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
241             // Moving forward
242             while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
243             {
244                 // TODO optimize this
245                 advanceBuffer();
246             }
247             return charAt(index - this.contentOffset);
248         }
249         // Moving backward
250         recenterBuffer(index);
251         return charAt(index - this.contentOffset);
252     }
253 
254     /***
255      * Move the buffer window on backing file back centering current access
256      * position in middle of window.
257      *
258      * @param index Index of character to access.
259      */
260     private void recenterBuffer(int index) {
261         if (logger.isLoggable(Level.FINE)) {
262             logger.fine("Recentering around " + index + " in " +
263                 this.backingFilename);
264         }
265         this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
266         if(this.wrapOrigin < this.prefixBuffer.length) {
267             this.wrapOrigin = this.prefixBuffer.length;
268         }
269         this.wrapOffset = 0;
270         loadBuffer();
271     }
272 
273     /***
274      * Load from backing file into the wrapper buffer.
275      */
276     private void loadBuffer()
277     {
278         long len = -1;
279         try {
280             len = this.raFile.length();
281             this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
282             this.raFile.readFully(this.wraparoundBuffer, 0,
283                 Math.min(this.wraparoundBuffer.length,
284                      this.absoluteLength - this.wrapOrigin));
285         }
286 
287         catch (IOException e) {
288             // TODO convert this to a runtime error?
289             DevUtils.logger.log (
290                 Level.SEVERE,
291                 "raFile.seek(" +
292                 (this.wrapOrigin - this.prefixBuffer.length) +
293                 ")\n" +
294                 "raFile.readFully(wraparoundBuffer,0," +
295                 (Math.min(this.wraparoundBuffer.length,
296                     this.length - this.wrapOrigin )) +
297                 ")\n"+
298                 "raFile.length()" + len + "\n" +
299                 DevUtils.extraInfo(),
300                 e);
301             throw new RuntimeException(e);
302         }
303     }
304 
305     /***
306      * Roll the wraparound buffer forward one position
307      */
308     private void advanceBuffer() {
309         try {
310             this.wraparoundBuffer[this.wrapOffset] =
311                 (byte)this.raFile.read();
312             this.wrapOffset++;
313             this.wrapOffset %= this.wraparoundBuffer.length;
314             this.wrapOrigin++;
315         } catch (IOException e) {
316             DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
317                 DevUtils.extraInfo(), e);
318             throw new RuntimeException(e);
319         }
320     }
321 
322     public CharSequence subSequence(int start, int end) {
323         return new CharSubSequence(this, start, end);
324     }
325 
326     /***
327      * Cleanup resources.
328      *
329      * @exception IOException Failed close of random access file.
330      */
331     public void close() throws IOException
332     {
333         this.prefixBuffer = null;
334         if (this.raFile != null) {
335             this.raFile.close();
336             this.raFile = null;
337         }
338     }
339 
340     /* (non-Javadoc)
341      * @see java.lang.Object#finalize()
342      */
343     protected void finalize() throws Throwable
344     {
345         super.finalize();
346         close();
347     }
348     
349     /***
350      * Convenience method for getting a substring. 
351      * @deprecated please use subSequence() and then toString() directly 
352      */
353     public String substring(int offset, int len) {
354         return subSequence(offset, offset+len).toString();
355     }
356 
357     /* (non-Javadoc)
358      * @see java.lang.Object#toString()
359      */
360     public String toString() {
361         StringBuilder sb = new StringBuilder(this.length());
362         sb.append(this);
363         return sb.toString();
364     }
365 }