1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.util.Vector;
32 import java.util.logging.Logger;
33 import java.util.regex.Matcher;
34
35 import org.apache.commons.io.IOUtils;
36 import org.archive.crawler.datamodel.CoreAttributeConstants;
37 import org.archive.crawler.datamodel.CrawlURI;
38 import org.archive.crawler.framework.CrawlController;
39 import org.archive.util.TextUtils;
40
41 import com.anotherbigidea.flash.interfaces.SWFActions;
42 import com.anotherbigidea.flash.interfaces.SWFTagTypes;
43 import com.anotherbigidea.flash.interfaces.SWFTags;
44 import com.anotherbigidea.flash.readers.ActionParser;
45 import com.anotherbigidea.flash.readers.SWFReader;
46 import com.anotherbigidea.flash.readers.TagParser;
47 import com.anotherbigidea.flash.structs.AlphaTransform;
48 import com.anotherbigidea.flash.structs.Matrix;
49 import com.anotherbigidea.flash.writers.SWFActionsImpl;
50 import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
51 import com.anotherbigidea.io.InStream;
52
53 /***
54 * Process SWF (flash/shockwave) files for strings that are likely to be
55 * crawlable URIs.
56 *
57 * @author Igor Ranitovic
58 */
59 public class ExtractorSWF extends Extractor implements CoreAttributeConstants {
60
61 private static final long serialVersionUID = 3627359592408010589L;
62
63 private static Logger logger = Logger.getLogger(ExtractorSWF.class
64 .getName());
65
66 protected long numberOfCURIsHandled = 0;
67
68 protected long numberOfLinksExtracted = 0;
69
70
71
72 private static final int MAX_READ_SIZE = 1024 * 1024;
73
74 /***
75 * @param name
76 */
77 public ExtractorSWF(String name) {
78 super(name, "Flash extractor. Extracts URIs from SWF "
79 + "(flash/shockwave) files.");
80 }
81
82 protected void extract(CrawlURI curi) {
83 if (!isHttpTransactionContentToProcess(curi)) {
84 return;
85 }
86
87 String contentType = curi.getContentType();
88 if (contentType == null) {
89 return;
90 }
91 if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
92 && (!curi.toString().toLowerCase().endsWith(".swf"))) {
93 return;
94 }
95
96 InputStream documentStream = null;
97 try {
98 documentStream =
99 curi.getHttpRecorder().getRecordedInput().getContentReplayInputStream();
100
101
102 SWFReader reader = getSWFReader(curi, documentStream);
103 if (reader == null) {
104 return;
105 }
106
107 numberOfCURIsHandled++;
108
109 reader.readFile();
110 } catch (IOException e) {
111 curi.addLocalizedError(getName(), e, "failed reading");
112 } catch (NullPointerException e) {
113 curi.addLocalizedError(getName(), e, "bad .swf file");
114 } catch (NegativeArraySizeException e) {
115 curi.addLocalizedError(getName(), e, "bad .swf file");
116 } finally {
117 IOUtils.closeQuietly(documentStream);
118 }
119
120
121 curi.linkExtractorFinished();
122 logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
123
124 }
125
126 /***
127 * Get a link extracting SWFParser.
128 *
129 * A custom SWFReader which parses links from .swf file.
130 *
131 * @param curi A CrawlURI to be processed.
132 * @return An SWFReader.
133 */
134 private SWFReader getSWFReader(CrawlURI curi, InputStream documentStream) {
135 if (documentStream == null) {
136 return null;
137 }
138
139
140
141 ExtractorSWFActions actions = new ExtractorSWFActions(curi,
142 getController());
143
144 ExtractorSWFTags tags = new ExtractorSWFTags(actions);
145
146 SWFReader reader = new ExtractorSWFReader(getTagParser(tags), documentStream);
147 return reader;
148 }
149
150 class ExtractorSWFReader extends SWFReader
151 {
152 public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
153 super(consumer, inputstream);
154 }
155
156 public ExtractorSWFReader(SWFTags consumer, InStream instream)
157 {
158 super(consumer, instream);
159 }
160
161 /***
162 * Override because a corrupt SWF file can cause us to try read
163 * lengths that are hundreds of megabytes in size causing us to
164 * OOME.
165 *
166 * Below is copied from SWFReader parent class.
167 */
168 public int readOneTag() throws IOException {
169 int header = mIn.readUI16();
170 int type = header >> 6;
171 int length = header & 0x3F;
172 boolean longTag = (length == 0x3F);
173 if (longTag) {
174 length = (int) mIn.readUI32();
175 }
176
177 if (length > MAX_READ_SIZE) {
178
179
180 mIn.skipBytes(length);
181 logger.info("oversized SWF tag (type=" + type + ";length="
182 + length + ") skipped");
183 } else {
184 byte[] contents = mIn.read(length);
185 mConsumer.tag(type, longTag, contents);
186 }
187 return type;
188 }
189 }
190
191
192 /***
193 * Get a TagParser
194 *
195 * A custom ExtractorTagParser which ignores all the big binary image/
196 * sound/font types which don't carry URLs is used, to avoid the
197 * occasionally fatal (OutOfMemoryError) memory bloat caused by the
198 * all-in-memory SWF library handling.
199 *
200 * @param customTags
201 * A custom tag parser.
202 * @return An SWFReader.
203 */
204 private TagParser getTagParser(SWFTagTypes customTags) {
205 return new ExtractorTagParser(customTags);
206 }
207
208 /***
209 * TagParser customized to ignore SWFTags that will never contain
210 * extractable URIs.
211 */
212 protected class ExtractorTagParser extends TagParser {
213
214 protected ExtractorTagParser(SWFTagTypes tagtypes) {
215 super(tagtypes);
216 }
217
218 protected void parseDefineBits(InStream in) throws IOException {
219
220 }
221
222 protected void parseDefineBitsJPEG3(InStream in) throws IOException {
223
224 }
225
226 protected void parseDefineBitsLossless(InStream in, int length,
227 boolean hasAlpha) throws IOException {
228
229 }
230
231 protected void parseDefineButtonSound(InStream in) throws IOException {
232
233 }
234
235 protected void parseDefineFont(InStream in) throws IOException {
236
237 }
238
239 protected void parseDefineJPEG2(InStream in, int length)
240 throws IOException {
241
242 }
243
244 protected void parseDefineJPEGTables(InStream in) throws IOException {
245
246 }
247
248 protected void parseDefineShape(int type, InStream in)
249 throws IOException {
250
251 }
252
253 protected void parseDefineSound(InStream in) throws IOException {
254
255 }
256
257 protected void parseFontInfo(InStream in, int length, boolean isFI2)
258 throws IOException {
259
260 }
261
262 protected void parseDefineFont2(InStream in) throws IOException {
263
264 }
265
266
267 @Override
268 protected void parseDefineSprite( InStream in ) throws IOException
269 {
270 int id = in.readUI16();
271 in.readUI16();
272
273 SWFTagTypes sstt = mTagtypes.tagDefineSprite( id );
274
275 if( sstt == null ) return;
276
277
278 TagParser parser = new ExtractorTagParser( sstt );
279 SWFReader reader = new ExtractorSWFReader( parser, in );
280
281 reader.readTags();
282 }
283
284
285
286 @Override
287 protected void parsePlaceObject2( InStream in ) throws IOException
288 {
289 boolean hasClipActions = in.readUBits(1) != 0;
290 boolean hasClipDepth = in.readUBits(1) != 0;
291 boolean hasName = in.readUBits(1) != 0;
292 boolean hasRatio = in.readUBits(1) != 0;
293 boolean hasColorTransform = in.readUBits(1) != 0;
294 boolean hasMatrix = in.readUBits(1) != 0;
295 boolean hasCharacter = in.readUBits(1) != 0;
296 boolean isMove = in.readUBits(1) != 0;
297
298 int depth = in.readUI16();
299
300 int charId = hasCharacter ? in.readUI16() : 0;
301 Matrix matrix = hasMatrix ? new Matrix( in ) : null;
302 AlphaTransform cxform = hasColorTransform ? new AlphaTransform( in ) : null;
303 int ratio = hasRatio ? in.readUI16() : -1;
304 String name = hasName ? in.readString( mStringEncoding ) : null;
305 int clipDepth = hasClipDepth ? in.readUI16() : 0;
306
307 int clipEventFlags = 0;
308
309 if (hasClipActions) {
310 in.readUI16();
311
312
313 clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
314 }
315
316 SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
317 depth, charId, matrix, cxform, ratio, name, clipEventFlags);
318
319 if (hasClipActions && actions != null) {
320 int flags = 0;
321
322
323 while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
324 in.readUI32();
325
326 actions.start(flags);
327 ActionParser parser = new ActionParser(actions, mFlashVersion);
328
329 parser.parse(in);
330 }
331
332 actions.done();
333 }
334 }
335 }
336
337 /***
338 * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
339 * parse URI-like strings.
340 */
341 protected class ExtractorSWFTags extends SWFTagTypesImpl {
342
343 private SWFActions actions;
344
345 public ExtractorSWFTags(SWFActions acts) {
346 super(null);
347 actions = acts;
348 }
349
350 public SWFActions tagDefineButton(int id, Vector buttonRecords)
351 throws IOException {
352
353 return actions;
354 }
355
356 public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
357 Vector buttonRecord2s) throws IOException {
358
359 return actions;
360 }
361
362 public SWFActions tagDoAction() throws IOException {
363 return actions;
364 }
365
366 public SWFActions tagDoInActions(int spriteId) throws IOException {
367 return actions;
368 }
369
370 public SWFTagTypes tagDefineSprite(int id) throws IOException {
371 return this;
372 }
373
374 public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
375 int depth, int charId, Matrix matrix, AlphaTransform cxform,
376 int ratio, String name, int clipActionFlags) throws IOException {
377
378 return actions;
379 }
380 }
381
382 /***
383 * SWFActions that parse URI-like strings. Links discovered using
384 * <code>ExtractorJS</code> are marked as speculative links (hop X). All
385 * other links are marked as embedded links (hop E).
386 *
387 */
388 protected class ExtractorSWFActions extends SWFActionsImpl {
389
390 private CrawlURI curi;
391
392 private CrawlController controller;
393
394 static final String JSSTRING = "javascript:";
395
396 /***
397 * @param curi
398 * SWF URL to handle
399 * @param controller
400 * Crawl controller need for error reporting
401 */
402 public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
403 assert (curi != null) : "CrawlURI should not be null";
404 this.curi = curi;
405 this.controller = controller;
406 }
407
408 /***
409 * Overwrite handling of discovered URIs.
410 *
411 * @param url
412 * Discovered URL.
413 * @param target
414 * Discovered target (currently not being used.)
415 * @throws IOException
416 */
417 public void getURL(String url, String target) throws IOException {
418 processURIString(url);
419 }
420
421 public void lookupTable(String[] strings) throws IOException {
422 for (String str : strings) {
423 considerStringAsUri(str);
424 }
425 }
426
427 public void push(String value) throws IOException {
428 considerStringAsUri(value);
429 }
430
431 public void considerStringAsUri(String str) throws IOException {
432 Matcher uri = TextUtils.getMatcher(ExtractorJS.STRING_URI_DETECTOR,
433 str);
434
435 if (uri.matches()) {
436 curi.createAndAddLinkRelativeToVia(uri.group(),
437 Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
438 incrementLinkCount(1);
439 }
440 TextUtils.recycleMatcher(uri);
441 }
442
443 public void processURIString(String url) throws IOException {
444 if (url.startsWith(JSSTRING)) {
445 incrementLinkCount(ExtractorJS.considerStrings(
446 curi, url, controller,false));
447 } else {
448 curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
449 Link.EMBED_HOP);
450 incrementLinkCount(1);
451 }
452 }
453
454 private void incrementLinkCount(long count) {
455 numberOfLinksExtracted += count;
456 }
457 }
458
459 public String report() {
460 StringBuffer ret = new StringBuffer();
461 ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
462 ret.append(" Function: Link extraction on Shockwave Flash "
463 + "documents (.swf)\n");
464
465 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
466 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
467 return ret.toString();
468 }
469 }