1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.extractor;
25
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.datamodel.CoreAttributeConstants;
35 import org.archive.crawler.datamodel.CrawlURI;
36 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.crawler.settings.Type;
39 import org.archive.io.ReplayCharSequence;
40 import org.archive.net.UURI;
41 import org.archive.net.UURIFactory;
42 import org.archive.util.DevUtils;
43 import org.archive.util.HttpRecorder;
44 import org.archive.util.TextUtils;
45
/**
 * Basic link extraction from an HTML content body,
 * using regular expressions.
 *
 * @author gojomo
 */
53 public class ExtractorHTML extends Extractor
54 implements CoreAttributeConstants {
55
56 private static final long serialVersionUID = 5855731422080471017L;
57
58 private static Logger logger =
59 Logger.getLogger(ExtractorHTML.class.getName());
60
61 /***
62 * Compiled relevant tag extractor.
63 *
64 * <p>
65 * This pattern extracts either:
66 * <li> (1) whole <script>...</script> or
67 * <li> (2) <style>...</style> or
68 * <li> (3) <meta ...> or
69 * <li> (4) any other open-tag with at least one attribute
70 * (eg matches "<a href='boo'>" but not "</a>" or "<br>")
71 * <p>
72 * groups:
73 * <li> 1: SCRIPT SRC=foo>boo</SCRIPT
74 * <li> 2: just script open tag
75 * <li> 3: STYLE TYPE=moo>zoo</STYLE
76 * <li> 4: just style open tag
77 * <li> 5: entire other tag, without '<' '>'
78 * <li> 6: element
79 * <li> 7: META
80 * <li> 8: !-- comment --
81 */
82
83 private static final int MAX_ELEMENT_LENGTH =
84 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
85 ".maxElementNameLength", "1024"));
86
87 static final String RELEVANT_TAG_EXTRACTOR =
88 "(?is)<(?:((script[^>]*+)>.*?</script)" +
89 "|((style[^>]*+)>.*?</style)" +
90 "|(((meta)|(?://w{1,"+MAX_ELEMENT_LENGTH+"}))//s+[^>]*+)" +
91 "|(!--.*?--))>";
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108 private static final int MAX_ATTR_NAME_LENGTH =
109 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
110 ".maxAttributeNameLength", "1024"));
111
112 static final int MAX_ATTR_VAL_LENGTH =
113 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
114 ".maxAttributeValueLength", "16384"));
115
116
117
118
119
120
121 static final String EACH_ATTRIBUTE_EXTRACTOR =
122 "(?is)//b((href)|(action)|(on//w*)"
123 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
124 +"|(?:usemap)|(?:profile)|(?:datasrc))"
125 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
126 +"|(value)|(style)|(method)"
127 +"|([-//w]{1,"+MAX_ATTR_NAME_LENGTH+"}))"
128 +"//s*=//s*"
129 +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))"
130 +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))"
131 +"|(//S{1,"+MAX_ATTR_VAL_LENGTH+"}))";
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158 static final String LIKELY_URI_PATH =
159 "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
160 static final String WHITESPACE = "//s";
161 static final String CLASSEXT =".class";
162 static final String APPLET = "applet";
163 static final String BASE = "base";
164 static final String LINK = "link";
165 static final String FRAME = "frame";
166 static final String IFRAME = "iframe";
167
168 public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
169 "treat-frames-as-embed-links";
170
171 public static final String ATTR_IGNORE_FORM_ACTION_URLS =
172 "ignore-form-action-urls";
173
174 public static final String ATTR_EXTRACT_ONLY_FORM_GETS =
175 "extract-only-form-gets";
176
177 /*** whether to try finding links in Javscript; default true */
178 public static final String ATTR_EXTRACT_JAVASCRIPT =
179 "extract-javascript";
180
181 public static final String EXTRACT_VALUE_ATTRIBUTES =
182 "extract-value-attributes";
183
184 public static final String ATTR_IGNORE_UNEXPECTED_HTML =
185 "ignore-unexpected-html";
186
187
188 protected long numberOfCURIsHandled = 0;
189 protected long numberOfLinksExtracted = 0;
190
/**
 * Create an HTML extractor with the stock description.
 *
 * @param name name of this processor
 */
public ExtractorHTML(String name) {
    this(name, "HTML extractor. Extracts links from HTML documents");
}

/**
 * Create an HTML extractor and register its six boolean settings, each of
 * which is flagged as an expert setting.
 *
 * @param name name of this processor
 * @param description description handed to the superclass
 */
public ExtractorHTML(String name, String description) {
    super(name, description);
    addExpertBooleanSetting(ATTR_EXTRACT_JAVASCRIPT,
        "If true, in-page Javascript is scanned for strings that " +
        "appear likely to be URIs. This typically finds both valid " +
        "and invalid URIs, and attempts to fetch the invalid URIs " +
        "sometimes generates webmaster concerns over odd crawler " +
        "behavior. Default is true.",
        Boolean.TRUE);
    addExpertBooleanSetting(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
        "If true, FRAME/IFRAME SRC-links are treated as embedded " +
        "resources (like IMG, 'E' hop-type), otherwise they are " +
        "treated as navigational links. Default is true.",
        Boolean.TRUE);
    addExpertBooleanSetting(ATTR_IGNORE_FORM_ACTION_URLS,
        "If true, URIs appearing as the ACTION attribute in " +
        "HTML FORMs are ignored. Default is false.",
        Boolean.FALSE);
    addExpertBooleanSetting(ATTR_EXTRACT_ONLY_FORM_GETS,
        "If true, only HTML FORM ACTIONs associated with the GET "+
        "method are extracted. (Form ACTIONs with method POST "+
        "will be ignored. Default is true",
        Boolean.TRUE);
    addExpertBooleanSetting(EXTRACT_VALUE_ATTRIBUTES,
        "If true, strings that look like URIs found in element VALUE " +
        "attributes (which are sometimes used as URIs by in-page " +
        "Javascript or server-side redirects) will be extracted. " +
        "This typically finds both valid and invalid URIs, and " +
        "attempts to fetch the invalid URIs sometimes generate " +
        "webmaster concerns over odd crawler behavior. Default " +
        "is true.",
        Boolean.TRUE);
    addExpertBooleanSetting(ATTR_IGNORE_UNEXPECTED_HTML,
        "If true, URIs which end in typical non-HTML extensions " +
        "(such as .gif) will not be scanned as if it were HTML. " +
        "Default is true.",
        Boolean.TRUE);
}

/**
 * Register one boolean setting in the definition and flag it as expert.
 *
 * @param settingName name of the setting
 * @param settingDescription text describing the setting
 * @param defaultValue default value of the setting
 */
private void addExpertBooleanSetting(String settingName,
        String settingDescription, Boolean defaultValue) {
    Type setting = addElementToDefinition(
        new SimpleType(settingName, settingDescription, defaultValue));
    setting.setExpertSetting(true);
}
241
242 protected void processGeneralTag(CrawlURI curi, CharSequence element,
243 CharSequence cs) {
244
245 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
246
247
248 String codebase = null;
249 ArrayList<String> resources = null;
250
251
252 CharSequence action = null;
253 CharSequence actionContext = null;
254 CharSequence method = null;
255
256 final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
257 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
258
259 final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
260 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
261
262 final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute
263 (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
264
265 final String elementStr = element.toString();
266
267 while (attr.find()) {
268 int valueGroup =
269 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
270 int start = attr.start(valueGroup);
271 int end = attr.end(valueGroup);
272 assert start >= 0: "Start is: " + start + ", " + curi;
273 assert end >= 0: "End is :" + end + ", " + curi;
274 CharSequence value = cs.subSequence(start, end);
275 value = TextUtils.unescapeHtml(value);
276 if (attr.start(2) > -1) {
277
278 CharSequence context =
279 Link.elementContext(element, attr.group(2));
280 if(elementStr.equalsIgnoreCase(LINK)) {
281
282 processEmbed(curi, value, context);
283 } else {
284
285 processLink(curi, value, context);
286 }
287 if (elementStr.equalsIgnoreCase(BASE)) {
288 try {
289 curi.setBaseURI(value.toString());
290 } catch (URIException e) {
291 if (getController() != null) {
292
293
294 getController().logUriError(e, curi.getUURI(),
295 value.toString());
296 } else {
297 logger.info("Failed set base uri: " +
298 curi + ", " + value.toString() + ": " +
299 e.getMessage());
300 }
301 }
302 }
303 } else if (attr.start(3) > -1) {
304
305 if (!ignoreFormActions) {
306 action = value;
307 actionContext = Link.elementContext(element,
308 attr.group(3));
309
310 }
311 } else if (attr.start(4) > -1) {
312
313 processScriptCode(curi, value);
314 } else if (attr.start(5) > -1) {
315
316 CharSequence context = Link.elementContext(element,
317 attr.group(5));
318
319
320 final char hopType;
321
322 if(!framesAsEmbeds
323 && (elementStr.equalsIgnoreCase(FRAME) || elementStr
324 .equalsIgnoreCase(IFRAME))) {
325 hopType = Link.NAVLINK_HOP;
326 } else {
327 hopType = Link.EMBED_HOP;
328 }
329 processEmbed(curi, value, context, hopType);
330 } else if (attr.start(6) > -1) {
331
332 codebase = (value instanceof String)?
333 (String)value: value.toString();
334 CharSequence context = Link.elementContext(element,
335 attr.group(6));
336 processEmbed(curi, codebase, context);
337 } else if (attr.start(7) > -1) {
338
339 if (resources == null) {
340 resources = new ArrayList<String>();
341 }
342 resources.add(value.toString());
343 } else if (attr.start(8) > -1) {
344
345 if (resources==null) {
346 resources = new ArrayList<String>();
347 }
348 String[] multi = TextUtils.split(WHITESPACE, value);
349 for(int i = 0; i < multi.length; i++ ) {
350 resources.add(multi[i]);
351 }
352 } else if (attr.start(9) > -1) {
353
354 if (resources==null) {
355 resources = new ArrayList<String>();
356 }
357
358
359 if (elementStr.equalsIgnoreCase(APPLET) &&
360 !value.toString().toLowerCase().endsWith(CLASSEXT)) {
361 resources.add(value.toString() + CLASSEXT);
362 } else {
363 resources.add(value.toString());
364 }
365 } else if (attr.start(10) > -1) {
366
367 if (extractValueAttributes
368 && TextUtils.matches(LIKELY_URI_PATH, value)) {
369 CharSequence context = Link.elementContext(element,
370 attr.group(10));
371 processLink(curi,value, context);
372 }
373
374 } else if (attr.start(11) > -1) {
375
376
377 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
378 curi, value, getController());
379
380 } else if (attr.start(12) > -1) {
381
382 method = value;
383
384 } else if (attr.start(13) > -1) {
385
386
387
388
389
390 }
391 }
392 TextUtils.recycleMatcher(attr);
393
394
395 if (resources != null) {
396 Iterator iter = resources.iterator();
397 UURI codebaseURI = null;
398 String res = null;
399 try {
400 if (codebase != null) {
401
402 codebaseURI = UURIFactory.
403 getInstance(curi.getUURI(), codebase);
404 }
405 while(iter.hasNext()) {
406 res = iter.next().toString();
407 res = (String) TextUtils.unescapeHtml(res);
408 if (codebaseURI != null) {
409 res = codebaseURI.resolve(res).toString();
410 }
411 processEmbed(curi, res, element);
412 }
413 } catch (URIException e) {
414 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
415 } catch (IllegalArgumentException e) {
416 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
417 "codebase=" + codebase + " res=" + res + "\n" +
418 DevUtils.extraInfo(), e);
419 }
420 }
421
422
423 if(action != null) {
424 if(method == null || "GET".equalsIgnoreCase(method.toString())
425 || ! ((Boolean)getUncheckedAttribute(curi,
426 ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
427 processLink(curi, action, actionContext);
428 }
429 }
430 }
431
432 /***
433 * Extract the (java)script source in the given CharSequence.
434 *
435 * @param curi source CrawlURI
436 * @param cs CharSequence of javascript code
437 */
438 protected void processScriptCode(CrawlURI curi, CharSequence cs) {
439 if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
440 this.numberOfLinksExtracted +=
441 ExtractorJS.considerStrings(curi, cs, getController(), false);
442 }
443 }
444
445 static final String JAVASCRIPT = "(?i)^javascript:.*";
446
447 /***
448 * Handle generic HREF cases.
449 *
450 * @param curi
451 * @param value
452 * @param context
453 */
454 protected void processLink(CrawlURI curi, final CharSequence value,
455 CharSequence context) {
456 if (TextUtils.matches(JAVASCRIPT, value)) {
457 processScriptCode(curi, value. subSequence(11, value.length()));
458 } else {
459 if (logger.isLoggable(Level.FINEST)) {
460 logger.finest("link: " + value.toString() + " from " + curi);
461 }
462 addLinkFromString(curi,
463 (value instanceof String)?
464 (String)value: value.toString(),
465 context, Link.NAVLINK_HOP);
466 this.numberOfLinksExtracted++;
467 }
468 }
469
470 private void addLinkFromString(CrawlURI curi, String uri,
471 CharSequence context, char hopType) {
472 try {
473
474
475
476
477 curi.createAndAddLinkRelativeToBase(uri, context.toString(),
478 hopType);
479 } catch (URIException e) {
480 if (getController() != null) {
481 getController().logUriError(e, curi.getUURI(), uri);
482 } else {
483 logger.info("Failed createAndAddLinkRelativeToBase " +
484 curi + ", " + uri + ", " + context + ", " + hopType +
485 ": " + e);
486 }
487 }
488 }
489
490 protected final void processEmbed(CrawlURI curi, CharSequence value,
491 CharSequence context) {
492 processEmbed(curi, value, context, Link.EMBED_HOP);
493 }
494
495 protected void processEmbed(CrawlURI curi, final CharSequence value,
496 CharSequence context, char hopType) {
497 if (logger.isLoggable(Level.FINEST)) {
498 logger.finest("embed (" + hopType + "): " + value.toString() +
499 " from " + curi);
500 }
501 addLinkFromString(curi,
502 (value instanceof String)?
503 (String)value: value.toString(),
504 context, hopType);
505 this.numberOfLinksExtracted++;
506 }
507
508 public void extract(CrawlURI curi) {
509 if (!isHttpTransactionContentToProcess(curi) ||
510 ! (isExpectedMimeType(curi.getContentType(), "text/html")
511 || isExpectedMimeType(curi.getContentType(), "application/xhtml")
512 || isExpectedMimeType(curi.getContentType(), "text/vnd.wap.wml")
513 || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.wml")
514 || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.xhtml"))) {
515 return;
516 }
517
518 final boolean ignoreUnexpectedHTML =
519 ((Boolean)getUncheckedAttribute(curi,
520 ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();
521
522 if (ignoreUnexpectedHTML) {
523 try {
524 if(!isHtmlExpectedHere(curi)) {
525
526
527 return;
528 }
529 } catch (URIException e) {
530 logger.severe("Failed expectedHTML test: " + e.getMessage());
531 }
532 }
533
534 this.numberOfCURIsHandled++;
535
536 ReplayCharSequence cs = null;
537
538 try {
539 HttpRecorder hr = curi.getHttpRecorder();
540 if (hr == null) {
541 throw new IOException("Why is recorder null here?");
542 }
543 cs = hr.getReplayCharSequence();
544 } catch (IOException e) {
545 curi.addLocalizedError(this.getName(), e,
546 "Failed get of replay char sequence " + curi.toString() +
547 " " + e.getMessage());
548 logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
549 Thread.currentThread().getName(), e);
550 }
551
552 if (cs == null) {
553 return;
554 }
555
556
557
558 try {
559
560 extract(curi, cs);
561
562 curi.linkExtractorFinished();
563 } finally {
564 if (cs != null) {
565 try {
566 cs.close();
567 } catch (IOException ioe) {
568 logger.warning(TextUtils.exceptionToString(
569 "Failed close of ReplayCharSequence.", ioe));
570 }
571 }
572 }
573 }
574
575 /***
576 * Run extractor.
577 * This method is package visible to ease testing.
578 * @param curi CrawlURI we're processing.
579 * @param cs Sequence from underlying ReplayCharSequence. This
580 * is TRANSIENT data. Make a copy if you want the data to live outside
581 * of this extractors' lifetime.
582 */
583 void extract(CrawlURI curi, CharSequence cs) {
584 Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
585 while(tags.find()) {
586 if(Thread.interrupted()){
587 break;
588 }
589 if (tags.start(8) > 0) {
590
591
592 } else if (tags.start(7) > 0) {
593
594 int start = tags.start(5);
595 int end = tags.end(5);
596 assert start >= 0: "Start is: " + start + ", " + curi;
597 assert end >= 0: "End is :" + end + ", " + curi;
598 if (processMeta(curi,
599 cs.subSequence(start, end))) {
600
601
602 break;
603 }
604 } else if (tags.start(5) > 0) {
605
606 int start5 = tags.start(5);
607 int end5 = tags.end(5);
608 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
609 assert end5 >= 0: "End is :" + end5 + ", " + curi;
610 int start6 = tags.start(6);
611 int end6 = tags.end(6);
612 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
613 assert end6 >= 0: "End is :" + end6 + ", " + curi;
614 processGeneralTag(curi,
615 cs.subSequence(start6, end6),
616 cs.subSequence(start5, end5));
617
618 } else if (tags.start(1) > 0) {
619
620 int start = tags.start(1);
621 int end = tags.end(1);
622 assert start >= 0: "Start is: " + start + ", " + curi;
623 assert end >= 0: "End is :" + end + ", " + curi;
624 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
625 ", " + curi;
626 processScript(curi, cs.subSequence(start, end),
627 tags.end(2) - start);
628
629 } else if (tags.start(3) > 0){
630
631 int start = tags.start(3);
632 int end = tags.end(3);
633 assert start >= 0: "Start is: " + start + ", " + curi;
634 assert end >= 0: "End is :" + end + ", " + curi;
635 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
636 ", " + curi;
637 processStyle(curi, cs.subSequence(start, end),
638 tags.end(4) - start);
639 }
640 }
641 TextUtils.recycleMatcher(tags);
642 }
643
644
645 static final String NON_HTML_PATH_EXTENSION =
646 "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
647 "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
648
649 /***
650 * Test whether this HTML is so unexpected (eg in place of a GIF URI)
651 * that it shouldn't be scanned for links.
652 *
653 * @param curi CrawlURI to examine.
654 * @return True if HTML is acceptable/expected here
655 * @throws URIException
656 */
657 protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
658 String path = curi.getUURI().getPath();
659 if(path==null) {
660
661 return true;
662 }
663 int dot = path.lastIndexOf('.');
664 if (dot < 0) {
665
666 return true;
667 }
668 if(dot<(path.length()-5)) {
669
670 return true;
671 }
672 String ext = path.substring(dot+1);
673 return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
674 }
675
676 protected void processScript(CrawlURI curi, CharSequence sequence,
677 int endOfOpenTag) {
678
679
680 processGeneralTag(curi,sequence.subSequence(0,6),
681 sequence.subSequence(0,endOfOpenTag));
682
683
684
685 processScriptCode(
686 curi, sequence.subSequence(endOfOpenTag, sequence.length()));
687 }
688
689 /***
690 * Process metadata tags.
691 * @param curi CrawlURI we're processing.
692 * @param cs Sequence from underlying ReplayCharSequence. This
693 * is TRANSIENT data. Make a copy if you want the data to live outside
694 * of this extractors' lifetime.
695 * @return True robots exclusion metatag.
696 */
697 protected boolean processMeta(CrawlURI curi, CharSequence cs) {
698 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
699 String name = null;
700 String httpEquiv = null;
701 String content = null;
702 while (attr.find()) {
703 int valueGroup =
704 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
705 CharSequence value =
706 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
707 value = TextUtils.unescapeHtml(value);
708 if (attr.group(1).equalsIgnoreCase("name")) {
709 name = value.toString();
710 } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
711 httpEquiv = value.toString();
712 } else if (attr.group(1).equalsIgnoreCase("content")) {
713 content = value.toString();
714 }
715
716 }
717 TextUtils.recycleMatcher(attr);
718
719
720 if("robots".equalsIgnoreCase(name) && content != null ) {
721 curi.putString(A_META_ROBOTS, content);
722 RobotsHonoringPolicy policy =
723 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
724 String contentLower = content.toLowerCase();
725 if ((policy == null
726 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
727 && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
728 && (contentLower.indexOf("nofollow") >= 0
729 || contentLower.indexOf("none") >= 0)) {
730
731
732 logger.fine("HTML extraction skipped due to robots meta-tag for: "
733 + curi.toString());
734 return true;
735 }
736 } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
737 int urlIndex = content.indexOf("=") + 1;
738 if(urlIndex>0) {
739 String refreshUri = content.substring(urlIndex);
740 try {
741 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
742 Link.REFER_HOP);
743 } catch (URIException e) {
744 if (getController() != null) {
745 getController().logUriError(e, curi.getUURI(), refreshUri);
746 } else {
747 logger.info("Failed createAndAddLinkRelativeToBase " +
748 curi + ", " + cs + ", " + refreshUri + ": " + e);
749 }
750 }
751 }
752 }
753 return false;
754 }
755
756 /***
757 * Process style text.
758 * @param curi CrawlURI we're processing.
759 * @param sequence Sequence from underlying ReplayCharSequence. This
760 * is TRANSIENT data. Make a copy if you want the data to live outside
761 * of this extractors' lifetime.
762 * @param endOfOpenTag
763 */
764 protected void processStyle(CrawlURI curi, CharSequence sequence,
765 int endOfOpenTag) {
766
767 processGeneralTag(curi, sequence.subSequence(0,6),
768 sequence.subSequence(0,endOfOpenTag));
769
770
771 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
772 curi, sequence.subSequence(endOfOpenTag,sequence.length()),
773 getController());
774 }
775
776
777
778
779
780
781 public String report() {
782 StringBuffer ret = new StringBuffer();
783 ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
784 ret.append(" Function: Link extraction on HTML documents\n");
785 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
786 ret.append(" Links extracted: " + this.numberOfLinksExtracted +
787 "\n\n");
788 return ret.toString();
789 }
790 }
791