1   /*
2    * Heritrix
3    *
4    * $Id: ExtractorSWF.java 6041 2008-11-18 02:42:39Z nlevitt $
5    *
6    * Created on March 19, 2004
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.util.Vector;
32  import java.util.logging.Logger;
33  import java.util.regex.Matcher;
34  
35  import org.apache.commons.io.IOUtils;
36  import org.archive.crawler.datamodel.CoreAttributeConstants;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.framework.CrawlController;
39  import org.archive.util.TextUtils;
40  
41  import com.anotherbigidea.flash.interfaces.SWFActions;
42  import com.anotherbigidea.flash.interfaces.SWFTagTypes;
43  import com.anotherbigidea.flash.interfaces.SWFTags;
44  import com.anotherbigidea.flash.readers.ActionParser;
45  import com.anotherbigidea.flash.readers.SWFReader;
46  import com.anotherbigidea.flash.readers.TagParser;
47  import com.anotherbigidea.flash.structs.AlphaTransform;
48  import com.anotherbigidea.flash.structs.Matrix;
49  import com.anotherbigidea.flash.writers.SWFActionsImpl;
50  import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
51  import com.anotherbigidea.io.InStream;
52  
53  /***
54   * Process SWF (flash/shockwave) files for strings that are likely to be
55   * crawlable URIs.
56   * 
57   * @author Igor Ranitovic
58   */
59  public class ExtractorSWF extends Extractor implements CoreAttributeConstants {
60  
61  	private static final long serialVersionUID = 3627359592408010589L;
62  
63  	private static Logger logger = Logger.getLogger(ExtractorSWF.class
64  			.getName());
65  
66  	protected long numberOfCURIsHandled = 0;
67  
68  	protected long numberOfLinksExtracted = 0;
69  
70  	// TODO: consider if this should be even smaller, because anything
71  	// containing URLs wouldn't be this big
72  	private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB
73  
74  	/***
75  	 * @param name
76  	 */
77  	public ExtractorSWF(String name) {
78  		super(name, "Flash extractor. Extracts URIs from SWF "
79  				+ "(flash/shockwave) files.");
80  	}
81  
82  	protected void extract(CrawlURI curi) {
83  		if (!isHttpTransactionContentToProcess(curi)) {
84  			return;
85  		}
86  
87  		String contentType = curi.getContentType();
88  		if (contentType == null) {
89  			return;
90  		}
91  		if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
92  				&& (!curi.toString().toLowerCase().endsWith(".swf"))) {
93  			return;
94  		}
95  
96          InputStream documentStream = null;
97  		try {
98              documentStream = 
99                  curi.getHttpRecorder().getRecordedInput().getContentReplayInputStream();
100             
101             // Get link extracting SWF reader
102             SWFReader reader = getSWFReader(curi, documentStream);
103             if (reader == null) {
104                 return;
105             }
106 
107             numberOfCURIsHandled++;
108 			// Parse file for links
109 			reader.readFile();
110 		} catch (IOException e) {
111 			curi.addLocalizedError(getName(), e, "failed reading");
112 		} catch (NullPointerException e) {
113 			curi.addLocalizedError(getName(), e, "bad .swf file");
114 		} catch (NegativeArraySizeException e) {
115 			curi.addLocalizedError(getName(), e, "bad .swf file");
116 		} finally {
117 		    IOUtils.closeQuietly(documentStream);
118         }
119 
120 		// Set flag to indicate that link extraction is completed.
121 		curi.linkExtractorFinished();
122 		logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
123 
124 	}
125 
126 	/***
127 	 * Get a link extracting SWFParser.
128 	 * 
129 	 * A custom SWFReader which parses links from .swf file.
130 	 * 
131 	 * @param curi A CrawlURI to be processed.
132 	 * @return An SWFReader.
133 	 */
134 	private SWFReader getSWFReader(CrawlURI curi, InputStream documentStream) {
135         if (documentStream == null) {
136             return null;
137         }
138 
139 		// Create SWF actions that will add discoved URIs to CrawlURI
140 		// alist(s).
141 		ExtractorSWFActions actions = new ExtractorSWFActions(curi,
142 				getController());
143 		// Overwrite parsing of specific tags that might have URIs.
144 		ExtractorSWFTags tags = new ExtractorSWFTags(actions);
145 		// Get a SWFReader instance.
146 		SWFReader reader = new ExtractorSWFReader(getTagParser(tags), documentStream);
147 		return reader;
148 	}
149 
150 	class ExtractorSWFReader extends SWFReader
151 	{
152 	    public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
153 	        super(consumer, inputstream);
154 	    }
155 	    
156 	    public ExtractorSWFReader(SWFTags consumer, InStream instream)
157 	    {
158 	        super(consumer, instream);
159 	    }    
160 
161 	    /***
162          * Override because a corrupt SWF file can cause us to try read
163          * lengths that are hundreds of megabytes in size causing us to
164          * OOME.
165          * 
166          * Below is copied from SWFReader parent class.
167          */
168         public int readOneTag() throws IOException {
169             int header = mIn.readUI16();
170             int type = header >> 6; // only want the top 10 bits
171             int length = header & 0x3F; // only want the bottom 6 bits
172             boolean longTag = (length == 0x3F);
173             if (longTag) {
174                 length = (int) mIn.readUI32();
175             }
176             // Below test added for Heritrix use.
177             if (length > MAX_READ_SIZE) {
178                 // skip to next, rather than throw IOException ending
179                 // processing
180                 mIn.skipBytes(length);
181                 logger.info("oversized SWF tag (type=" + type + ";length="
182                         + length + ") skipped");
183             } else {
184                 byte[] contents = mIn.read(length);
185                 mConsumer.tag(type, longTag, contents);
186             }
187             return type;
188         }
189     }
190 
191 
192 	/***
193 	 * Get a TagParser
194 	 * 
195 	 * A custom ExtractorTagParser which ignores all the big binary image/
196 	 * sound/font types which don't carry URLs is used, to avoid the
197 	 * occasionally fatal (OutOfMemoryError) memory bloat caused by the
198 	 * all-in-memory SWF library handling.
199 	 * 
200 	 * @param customTags
201 	 *            A custom tag parser.
202 	 * @return An SWFReader.
203 	 */
204 	private TagParser getTagParser(SWFTagTypes customTags) {
205 		return new ExtractorTagParser(customTags);
206 	}
207 
208 	/***
209 	 * TagParser customized to ignore SWFTags that will never contain
210 	 * extractable URIs.
211 	 */
212 	protected class ExtractorTagParser extends TagParser {
213 
214 		protected ExtractorTagParser(SWFTagTypes tagtypes) {
215 			super(tagtypes);
216 		}
217 
218 		protected void parseDefineBits(InStream in) throws IOException {
219 			// DO NOTHING - no URLs to be found in bits
220 		}
221 
222 		protected void parseDefineBitsJPEG3(InStream in) throws IOException {
223 			// DO NOTHING - no URLs to be found in bits
224 		}
225 
226 		protected void parseDefineBitsLossless(InStream in, int length,
227 				boolean hasAlpha) throws IOException {
228 			// DO NOTHING - no URLs to be found in bits
229 		}
230 
231 		protected void parseDefineButtonSound(InStream in) throws IOException {
232 			// DO NOTHING - no URLs to be found in sound
233 		}
234 
235 		protected void parseDefineFont(InStream in) throws IOException {
236 			// DO NOTHING - no URLs to be found in font
237 		}
238 
239 		protected void parseDefineJPEG2(InStream in, int length)
240 				throws IOException {
241 			// DO NOTHING - no URLs to be found in jpeg
242 		}
243 
244 		protected void parseDefineJPEGTables(InStream in) throws IOException {
245 			// DO NOTHING - no URLs to be found in jpeg
246 		}
247 
248 		protected void parseDefineShape(int type, InStream in)
249 				throws IOException {
250 			// DO NOTHING - no URLs to be found in shape
251 		}
252 
253 		protected void parseDefineSound(InStream in) throws IOException {
254 			// DO NOTHING - no URLs to be found in sound
255 		}
256 
257 		protected void parseFontInfo(InStream in, int length, boolean isFI2)
258 				throws IOException {
259 			// DO NOTHING - no URLs to be found in font info
260 		}
261 
262 		protected void parseDefineFont2(InStream in) throws IOException {
263 			// DO NOTHING - no URLs to be found in bits
264 		}
265 		
266 		// heritrix: Overridden to use our TagParser and SWFReader. The rest of the code is the same.
267 		@Override
268 	    protected void parseDefineSprite( InStream in ) throws IOException
269 	    {
270 	        int id         = in.readUI16();
271 	        in.readUI16(); // frame count
272 	        
273 	        SWFTagTypes sstt = mTagtypes.tagDefineSprite( id );
274 	        
275 	        if( sstt == null ) return;
276 	        
277 	        // heritrix: only these two lines differ from super.parseDefineSprite()
278 	        TagParser parser = new ExtractorTagParser( sstt );
279 	        SWFReader reader = new ExtractorSWFReader( parser, in );
280 	        
281 	        reader.readTags();
282 	    }
283 
284 		// Overridden to read 32 bit clip event flags when flash version >= 6.
285         // All the rest of the code is copied directly. Fixes HER-1509.
286 		@Override
287 	    protected void parsePlaceObject2( InStream in ) throws IOException
288 	    {
289 	        boolean hasClipActions    = in.readUBits(1) != 0;
290 	        boolean hasClipDepth      = in.readUBits(1) != 0;
291 	        boolean hasName           = in.readUBits(1) != 0;
292 	        boolean hasRatio          = in.readUBits(1) != 0;
293 	        boolean hasColorTransform = in.readUBits(1) != 0;
294 	        boolean hasMatrix         = in.readUBits(1) != 0;
295 	        boolean hasCharacter      = in.readUBits(1) != 0;
296 	        boolean isMove            = in.readUBits(1) != 0;
297 	    
298 	        int depth = in.readUI16();
299 	        
300 	        int            charId    = hasCharacter      ? in.readUI16()            : 0;
301 	        Matrix         matrix    = hasMatrix         ? new Matrix( in )         : null;
302 	        AlphaTransform cxform    = hasColorTransform ? new AlphaTransform( in ) : null;
303 	        int            ratio     = hasRatio          ? in.readUI16()            : -1;        
304 	        String         name      = hasName           ? in.readString( mStringEncoding )  : null;  
305 	        int            clipDepth = hasClipDepth      ? in.readUI16()            : 0;
306 	        
307 	        int clipEventFlags = 0;
308 	        
309 	        if (hasClipActions) {
310                 in.readUI16(); // reserved
311 
312                 // heritrix: flags size changed in swf version 6
313                 clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
314             }
315 	        
316 	        SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
317                     depth, charId, matrix, cxform, ratio, name, clipEventFlags);
318 
319             if (hasClipActions && actions != null) {
320                 int flags = 0;
321 
322                 // heritrix: flags size changed in swf version 6
323                 while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
324                     in.readUI32(); // length
325 
326                     actions.start(flags);
327                     ActionParser parser = new ActionParser(actions, mFlashVersion);
328 
329                     parser.parse(in);
330                 }
331 
332                 actions.done();
333             }
334         }
335 	}
336 
337 	/***
338 	 * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
339 	 * parse URI-like strings.
340 	 */
341 	protected class ExtractorSWFTags extends SWFTagTypesImpl {
342 
343 		private SWFActions actions;
344 
345 		public ExtractorSWFTags(SWFActions acts) {
346 			super(null);
347 			actions = acts;
348 		}
349 
350 		public SWFActions tagDefineButton(int id, Vector buttonRecords)
351 				throws IOException {
352 
353 			return actions;
354 		}
355 
356 		public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
357 				Vector buttonRecord2s) throws IOException {
358 
359 			return actions;
360 		}
361 
362 		public SWFActions tagDoAction() throws IOException {
363 			return actions;
364 		}
365 
366 		public SWFActions tagDoInActions(int spriteId) throws IOException {
367 			return actions;
368 		}
369 
370 		public SWFTagTypes tagDefineSprite(int id) throws IOException {
371 			return this;
372 		}
373 
374 		public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
375 				int depth, int charId, Matrix matrix, AlphaTransform cxform,
376 				int ratio, String name, int clipActionFlags) throws IOException {
377 
378 			return actions;
379 		}
380 	}
381 
382 	/***
383 	 * SWFActions that parse URI-like strings. Links discovered using
384 	 * <code>ExtractorJS</code> are marked as speculative links (hop X). All
385 	 * other links are marked as embedded links (hop E).
386 	 * 
387 	 */
388 	protected class ExtractorSWFActions extends SWFActionsImpl {
389 
390 		private CrawlURI curi;
391 
392 		private CrawlController controller;
393 
394 		static final String JSSTRING = "javascript:";
395 
396 		/***
397 		 * @param curi
398 		 *            SWF URL to handle
399 		 * @param controller
400 		 *            Crawl controller need for error reporting
401 		 */
402 		public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
403 			assert (curi != null) : "CrawlURI should not be null";
404 			this.curi = curi;
405 			this.controller = controller;
406 		}
407 
408 		/***
409 		 * Overwrite handling of discovered URIs.
410 		 * 
411 		 * @param url
412 		 *            Discovered URL.
413 		 * @param target
414 		 *            Discovered target (currently not being used.)
415 		 * @throws IOException
416 		 */
417 		public void getURL(String url, String target) throws IOException {
418 			processURIString(url);
419 		}
420 
421 		public void lookupTable(String[] strings) throws IOException {
422 			for (String str : strings) {
423 				considerStringAsUri(str);
424 			}
425 		}
426 
427 		public void push(String value) throws IOException {
428 			considerStringAsUri(value);
429 		}
430 
431 		public void considerStringAsUri(String str) throws IOException {
432 			Matcher uri = TextUtils.getMatcher(ExtractorJS.STRING_URI_DETECTOR,
433 					str);
434 
435 			if (uri.matches()) {
436 				curi.createAndAddLinkRelativeToVia(uri.group(),
437 						Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
438 				incrementLinkCount(1);
439 			}
440 			TextUtils.recycleMatcher(uri);
441 		}
442 
443 		public void processURIString(String url) throws IOException {
444 			if (url.startsWith(JSSTRING)) {
445 				incrementLinkCount(ExtractorJS.considerStrings(
446 						curi, url, controller,false));
447 			} else {
448 				curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
449 						Link.EMBED_HOP);
450 				incrementLinkCount(1);
451 			}
452 		}
453 
454 		private void incrementLinkCount(long count) {
455 			numberOfLinksExtracted += count;
456 		}
457 	}
458 
459 	public String report() {
460 		StringBuffer ret = new StringBuffer();
461 		ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
462 		ret.append("  Function:          Link extraction on Shockwave Flash "
463 				+ "documents (.swf)\n");
464 
465 		ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
466 		ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
467 		return ret.toString();
468 	}
469 }