1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.IOException;
26 import java.io.RandomAccessFile;
27 import java.io.UnsupportedEncodingException;
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.archive.util.DevUtils;
32
33 /***
34 * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
35 * buffer and overflow backing file).
36 *
37 * Assumes the byte stream is ISO-8859-1 text, taking advantage of the fact
38 * that each byte in the stream corresponds to a single unicode character with
39 * the same numerical value as the byte.
40 *
41 * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
42 * from disk in memory; as long as the 'random access' of a CharSequence
43 * user stays within this window, access should remain fairly efficient.
44 * (So design any regexps pointed at these CharSequences to work within
45 * that range!)
46 *
47 * <p>When rereading of a location is necessary, the whole window is
48 * recentered around the location requested. (TODO: More research
49 * into whether this is the best strategy.)
50 *
51 * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
52 * to wrap the passed prefix buffer and the second, a memory-mapped
53 * ByteBuffer view into the backing file -- was consistently slower: ~10%.
54 * My tests did the following. Made a buffer filled w/ regular content.
55 * This buffer was used as the prefix buffer. The buffer content was
56 * written MULTIPLER times to a backing file. I then did accesses w/ the
57 * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
58 * read forward from byte 16-32. Repeat. Though I varied the size of the
59 * buffer to the size of the backing file,from 3-10, the difference of 10%
60 * or so seemed to persist. Same if I tried to favor get() over get(index).
61 * I used a profiler, JMP, to study times taken (St.Ack did above comment).
62 *
63 * <p>TODO determine in memory mapped files is better way to do this;
64 * probably not -- they don't offer the level of control over
65 * total memory used that this approach does.
66 *
67 * @author Gordon Mohr
68 * @version $Revision: 5849 $, $Date: 2008-06-28 01:29:33 +0000 (Sat, 28 Jun 2008) $
69 */
70 class Latin1ByteReplayCharSequence implements ReplayCharSequence {
71
72 protected static Logger logger =
73 Logger.getLogger(Latin1ByteReplayCharSequence.class.getName());
74
75 /***
76 * Buffer that holds the first bit of content.
77 *
78 * Once this is exhausted we go to the backing file.
79 */
80 private byte[] prefixBuffer;
81
82 /***
83 * Total length of character stream to replay minus the HTTP headers
84 * if present.
85 *
86 * Used to find EOS.
87 */
88 protected int length;
89
90 /***
91 * Absolute length of the stream.
92 *
93 * Includes HTTP headers. Needed doing calc. in the below figuring
94 * how much to load into buffer.
95 */
96 private int absoluteLength = -1;
97
98 /***
99 * Buffer window on to backing file.
100 */
101 private byte[] wraparoundBuffer;
102
103 /***
104 * Absolute index into underlying bytestream where wrap starts.
105 */
106 private int wrapOrigin;
107
108 /***
109 * Index in wraparoundBuffer that corresponds to wrapOrigin
110 */
111 private int wrapOffset;
112
113 /***
114 * Name of backing file we go to when we've exhausted content from the
115 * prefix buffer.
116 */
117 private String backingFilename;
118
119 /***
120 * Random access to the backing file.
121 */
122 private RandomAccessFile raFile;
123
124 /***
125 * Offset into prefix buffer at which content beings.
126 */
127 private int contentOffset;
128
129 /***
130 * 8-bit encoding used reading single bytes from buffer and
131 * stream.
132 */
133 private static final String DEFAULT_SINGLE_BYTE_ENCODING =
134 "ISO-8859-1";
135
136
137 /***
138 * Constructor.
139 *
140 * @param buffer In-memory buffer of recordings prefix. We read from
141 * here first and will only go to the backing file if <code>size</code>
142 * requested is greater than <code>buffer.length</code>.
143 * @param size Total size of stream to replay in bytes. Used to find
144 * EOS. This is total length of content including HTTP headers if
145 * present.
146 * @param responseBodyStart Where the response body starts in bytes.
147 * Used to skip over the HTTP headers if present.
148 * @param backingFilename Path to backing file with content in excess of
149 * whats in <code>buffer</code>.
150 *
151 * @throws IOException
152 */
153 public Latin1ByteReplayCharSequence(byte[] buffer, long size,
154 long responseBodyStart, String backingFilename)
155 throws IOException {
156
157 this.length = (int)(size - responseBodyStart);
158 this.absoluteLength = (int)size;
159 this.prefixBuffer = buffer;
160 this.contentOffset = (int)responseBodyStart;
161
162
163
164 if (size > buffer.length) {
165 this.backingFilename = backingFilename;
166 this.raFile = new RandomAccessFile(backingFilename, "r");
167 this.wraparoundBuffer = new byte[this.prefixBuffer.length];
168 this.wrapOrigin = this.prefixBuffer.length;
169 this.wrapOffset = 0;
170 loadBuffer();
171 }
172 }
173
174 /***
175 * @return Length of characters in stream to replay. Starts counting
176 * at the HTTP header/body boundary.
177 */
178 public int length() {
179 return this.length;
180 }
181
182 /***
183 * Get character at passed absolute position.
184 *
185 * Called by {@link #charAt(int)} which has a relative index into the
186 * content, one that doesn't account for HTTP header if present.
187 *
188 * @param index Index into content adjusted to accomodate initial offset
189 * to get us past the HTTP header if present (i.e.
190 * {@link #contentOffset}).
191 *
192 * @return Characater at offset <code>index</code>.
193 */
194 public char charAt(int index) {
195 int c = -1;
196
197
198 index += this.contentOffset;
199 if (index < this.prefixBuffer.length) {
200
201 c = this.prefixBuffer[index];
202 } else if (index >= this.wrapOrigin &&
203 (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
204
205 c = this.wraparoundBuffer[
206 ((index - this.wrapOrigin) + this.wrapOffset) %
207 this.wraparoundBuffer.length];
208 } else {
209
210
211
212 c = faultCharAt(index);
213 }
214
215
216 return (char)(c & 0xff);
217 }
218
219 /***
220 * Get a character that's outside the current buffers.
221 *
222 * will cause the wraparoundBuffer to be changed to
223 * cover a region including the index
224 *
225 * if index is higher than the highest index in the
226 * wraparound buffer, buffer is moved forward such
227 * that requested char is last item in buffer
228 *
229 * if index is lower than lowest index in the
230 * wraparound buffer, buffet is reset centered around
231 * index
232 *
233 * @param index Index of character to fetch.
234 * @return A character that's outside the current buffers
235 */
236 private int faultCharAt(int index) {
237 if(Thread.interrupted()) {
238 throw new RuntimeException("thread interrupted");
239 }
240 if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
241
242 while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
243 {
244
245 advanceBuffer();
246 }
247 return charAt(index - this.contentOffset);
248 }
249
250 recenterBuffer(index);
251 return charAt(index - this.contentOffset);
252 }
253
254 /***
255 * Move the buffer window on backing file back centering current access
256 * position in middle of window.
257 *
258 * @param index Index of character to access.
259 */
260 private void recenterBuffer(int index) {
261 if (logger.isLoggable(Level.FINE)) {
262 logger.fine("Recentering around " + index + " in " +
263 this.backingFilename);
264 }
265 this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
266 if(this.wrapOrigin < this.prefixBuffer.length) {
267 this.wrapOrigin = this.prefixBuffer.length;
268 }
269 this.wrapOffset = 0;
270 loadBuffer();
271 }
272
273 /***
274 * Load from backing file into the wrapper buffer.
275 */
276 private void loadBuffer()
277 {
278 long len = -1;
279 try {
280 len = this.raFile.length();
281 this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
282 this.raFile.readFully(this.wraparoundBuffer, 0,
283 Math.min(this.wraparoundBuffer.length,
284 this.absoluteLength - this.wrapOrigin));
285 }
286
287 catch (IOException e) {
288
289 DevUtils.logger.log (
290 Level.SEVERE,
291 "raFile.seek(" +
292 (this.wrapOrigin - this.prefixBuffer.length) +
293 ")\n" +
294 "raFile.readFully(wraparoundBuffer,0," +
295 (Math.min(this.wraparoundBuffer.length,
296 this.length - this.wrapOrigin )) +
297 ")\n"+
298 "raFile.length()" + len + "\n" +
299 DevUtils.extraInfo(),
300 e);
301 throw new RuntimeException(e);
302 }
303 }
304
305 /***
306 * Roll the wraparound buffer forward one position
307 */
308 private void advanceBuffer() {
309 try {
310 this.wraparoundBuffer[this.wrapOffset] =
311 (byte)this.raFile.read();
312 this.wrapOffset++;
313 this.wrapOffset %= this.wraparoundBuffer.length;
314 this.wrapOrigin++;
315 } catch (IOException e) {
316 DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
317 DevUtils.extraInfo(), e);
318 throw new RuntimeException(e);
319 }
320 }
321
322 public CharSequence subSequence(int start, int end) {
323 return new CharSubSequence(this, start, end);
324 }
325
326 /***
327 * Cleanup resources.
328 *
329 * @exception IOException Failed close of random access file.
330 */
331 public void close() throws IOException
332 {
333 this.prefixBuffer = null;
334 if (this.raFile != null) {
335 this.raFile.close();
336 this.raFile = null;
337 }
338 }
339
340
341
342
343 protected void finalize() throws Throwable
344 {
345 super.finalize();
346 close();
347 }
348
349 /***
350 * Convenience method for getting a substring.
351 * @deprecated please use subSequence() and then toString() directly
352 */
353 public String substring(int offset, int len) {
354 return subSequence(offset, offset+len).toString();
355 }
356
357
358
359
360 public String toString() {
361 StringBuilder sb = new StringBuilder(this.length());
362 sb.append(this);
363 return sb.toString();
364 }
365 }