1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.extractor;
25
26 import java.io.IOException;
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29 import java.util.regex.Matcher;
30
31 import org.apache.commons.codec.DecoderException;
32 import org.apache.commons.httpclient.URIException;
33 import org.archive.crawler.datamodel.CoreAttributeConstants;
34 import org.archive.crawler.datamodel.CrawlURI;
35 import org.archive.crawler.framework.CrawlController;
36 import org.archive.io.ReplayCharSequence;
37 import org.archive.net.LaxURLCodec;
38 import org.archive.net.UURI;
39 import org.archive.util.ArchiveUtils;
40 import org.archive.util.DevUtils;
41 import org.archive.util.TextUtils;
42
/**
 * Processes Javascript files for strings that are likely to be
 * crawlable URIs.
 *
 * @contributor gojomo
 * @contributor szznax
 *
 */
51 public class ExtractorJS extends Extractor implements CoreAttributeConstants {
52
53 private static final long serialVersionUID = -2231962381454717720L;
54
55 private static Logger LOGGER =
56 Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
57
58 static final String AMP = "&";
59 static final String ESCAPED_AMP = "&";
60 static final String WHITESPACE = "//s";
61
62
63
64
65 static final String JAVASCRIPT_STRING_EXTRACTOR =
66 "(////{0,8}+(?:\"|\'))(//S{0,"+UURI.MAX_URL_LENGTH+"}?)(?://1)";
67
68
69
70
71
72
73
74 static final String STRING_URI_DETECTOR =
75 "(?://w|[//.]{0,2}/)[//S&&[^<>]]*(?://.|/)[//S&&[^<>]]*(?://w|/)";
76
77 protected long numberOfCURIsHandled = 0;
78 protected static long numberOfLinksExtracted = 0;
79
80
81
82
83 protected final static String[] STRING_URI_DETECTOR_EXCEPTIONS = {
84 "text/javascript"
85 };
86
87
88
89
90
91
92 protected final static String[] EXTRACTOR_URI_EXCEPTIONS = {
93 "http://www.google-analytics.com/urchin.js"
94 };
95
96 /***
97 * @param name
98 */
99 public ExtractorJS(String name) {
100 super(name, "JavaScript extractor. Link extraction on JavaScript" +
101 " files (.js).");
102 }
103
104
105
106
107 public void extract(CrawlURI curi) {
108
109
110 for (String s: EXTRACTOR_URI_EXCEPTIONS) {
111 if (curi.toString().equals(s))
112 return;
113 }
114
115 if (!isHttpTransactionContentToProcess(curi)) {
116 return;
117 }
118 String contentType = curi.getContentType();
119 if ((contentType == null)) {
120 return;
121 }
122
123
124 if((contentType.indexOf("javascript") < 0) &&
125 (contentType.indexOf("jscript") < 0) &&
126 (contentType.indexOf("ecmascript") < 0) &&
127 (!curi.toString().toLowerCase().endsWith(".js")) &&
128 (curi.getViaContext() == null || !curi.getViaContext().
129 toString().toLowerCase().startsWith("script"))) {
130 return;
131 }
132
133 this.numberOfCURIsHandled++;
134
135 ReplayCharSequence cs = null;
136 try {
137 cs = curi.getHttpRecorder().getReplayCharSequence();
138 } catch (IOException e) {
139 curi.addLocalizedError(this.getName(), e,
140 "Failed get of replay char sequence.");
141 }
142 if (cs == null) {
143 LOGGER.warning("Failed getting ReplayCharSequence: " +
144 curi.toString());
145 return;
146 }
147
148 try {
149 try {
150 numberOfLinksExtracted += considerStrings(curi, cs,
151 getController(), true);
152 } catch (StackOverflowError e) {
153 DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
154 }
155
156 curi.linkExtractorFinished();
157 } finally {
158
159 if (cs != null) {
160 try {
161 cs.close();
162 } catch (IOException ioe) {
163 LOGGER.warning(TextUtils.exceptionToString(
164 "Failed close of ReplayCharSequence.", ioe));
165 }
166 }
167 }
168 }
169
170 public static long considerStrings(CrawlURI curi, CharSequence cs,
171 CrawlController controller, boolean handlingJSFile) {
172 long foundLinks = 0;
173 Matcher strings =
174 TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
175 while(strings.find()) {
176 CharSequence subsequence =
177 cs.subSequence(strings.start(2), strings.end(2));
178 Matcher uri =
179 TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence);
180 if(uri.matches()) {
181 String string = uri.group();
182
183 if (isUriMatchException(string,cs)) {
184 TextUtils.recycleMatcher(uri);
185 continue;
186 }
187 string = speculativeFixup(string, curi);
188 foundLinks++;
189 try {
190 if (handlingJSFile) {
191 curi.createAndAddLinkRelativeToVia(string,
192 Link.JS_MISC, Link.SPECULATIVE_HOP);
193 } else {
194 curi.createAndAddLinkRelativeToBase(string,
195 Link.JS_MISC, Link.SPECULATIVE_HOP);
196 }
197 } catch (URIException e) {
198
199
200 if (controller != null) {
201 controller.logUriError(e, curi.getUURI(), string);
202 } else {
203 LOGGER.info(curi + ", " + string + ": " +
204 e.getMessage());
205 }
206 }
207 } else {
208 foundLinks += considerStrings(curi, subsequence,
209 controller, handlingJSFile);
210 }
211 TextUtils.recycleMatcher(uri);
212 }
213 TextUtils.recycleMatcher(strings);
214 return foundLinks;
215 }
216
217 /***
218 * checks to see if URI match is a special case
219 * @param string matched by <code>STRING_URI_DETECTOR</code>
220 * @param cs
221 * @return true if string is one of <code>STRING_URI_EXCEPTIONS</code>
222 */
223 private static boolean isUriMatchException(String string,CharSequence cs) {
224 for (String s : STRING_URI_DETECTOR_EXCEPTIONS) {
225 if (s.equals(string))
226 return true;
227 }
228 return false;
229 }
230
231 /***
232 * Perform additional fixup of likely-URI Strings
233 *
234 * @param string detected candidate String
235 * @return String changed/decoded to increase liklihood it is a
236 * meaningful non-404 URI
237 */
238 public static String speculativeFixup(String string, CrawlURI curi) {
239 String retVal = string;
240
241
242 retVal = TextUtils.replaceAll(ESCAPED_AMP, retVal, AMP);
243
244
245 Matcher m = TextUtils.getMatcher("(?i)^https?%3A.*",retVal);
246 if(m.matches()) {
247 try {
248 retVal = LaxURLCodec.DEFAULT.decode(retVal);
249 } catch (DecoderException e) {
250 LOGGER.log(Level.INFO,"unable to decode",e);
251 }
252 }
253 TextUtils.recycleMatcher(m);
254
255
256
257
258
259
260 m = TextUtils.getMatcher(
261 "^[^//./://s%]+//.[^/://s%]+//.([^//./://s%]+)(/.*|)$",
262 retVal);
263 if(m.matches()) {
264 if(ArchiveUtils.isTld(m.group(1))) {
265 String schemePlus = "http://";
266
267 try {
268 if (retVal.startsWith(curi.getUURI().getHost())) {
269 schemePlus = curi.getUURI().getScheme() + "://";
270 }
271 } catch (URIException e) {
272
273 }
274 retVal = schemePlus + retVal;
275 }
276 }
277 TextUtils.recycleMatcher(m);
278
279 return retVal;
280 }
281
282
283
284
285
286
287 public String report() {
288 StringBuffer ret = new StringBuffer();
289 ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
290 ret.append(" Function: Link extraction on JavaScript code\n");
291 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
292 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
293
294 return ret.toString();
295 }
296 }