1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimpleHTMLExtractor.java
20   * Created on Jun 5, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.io.IOException;
27  import java.util.ArrayList;
28  import java.util.Iterator;
29  import java.util.logging.Level;
30  import java.util.logging.Logger;
31  import java.util.regex.Matcher;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.archive.crawler.datamodel.CoreAttributeConstants;
35  import org.archive.crawler.datamodel.CrawlURI;
36  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
37  import org.archive.crawler.settings.SimpleType;
38  import org.archive.crawler.settings.Type;
39  import org.archive.io.ReplayCharSequence;
40  import org.archive.net.UURI;
41  import org.archive.net.UURIFactory;
42  import org.archive.util.DevUtils;
43  import org.archive.util.HttpRecorder;
44  import org.archive.util.TextUtils;
45  
46  /***
47   * Basic link-extraction, from an HTML content-body,
48   * using regular expressions.
49   *
50   * @author gojomo
51   *
52   */
53  public class ExtractorHTML extends Extractor
54  implements CoreAttributeConstants {
55  
56      private static final long serialVersionUID = 5855731422080471017L;
57  
58      private static Logger logger =
59          Logger.getLogger(ExtractorHTML.class.getName());
60  
61      /***
62       * Compiled relevant tag extractor.
63       *
64       * <p>
65       * This pattern extracts either:
66       * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
67       * <li> (2) &lt;style&gt;...&lt;/style&gt; or
68       * <li> (3) &lt;meta ...&gt; or
69       * <li> (4) any other open-tag with at least one attribute
70       * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
71       * <p>
72       * groups:
73       * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
74       * <li> 2: just script open tag
75       * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
76       * <li> 4: just style open tag
77       * <li> 5: entire other tag, without '<' '>'
78       * <li> 6: element
79       * <li> 7: META
80       * <li> 8: !-- comment --
81       */
82  // version w/ less unnecessary backtracking
83        private static final int MAX_ELEMENT_LENGTH =
84            Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
85                ".maxElementNameLength", "1024"));
86        
87        static final String RELEVANT_TAG_EXTRACTOR =
88            "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
89            "|((style[^>]*+)>.*?</style)" + // 3, 4
90            "|(((meta)|(?://w{1,"+MAX_ELEMENT_LENGTH+"}))//s+[^>]*+)" + // 5, 6, 7
91            "|(!--.*?--))>"; // 8 
92  
93  //    version w/ problems with unclosed script tags 
94  //    static final String RELEVANT_TAG_EXTRACTOR =
95  //    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?://w+))//s+.*?)|(!--.*?--))>";
96  
97  
98        
99  //    // this pattern extracts 'href' or 'src' attributes from
100 //    // any open-tag innards matched by the above
101 //    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
102 //     "(?is)(//w+)(?://s+|(?://s.*?//s))(?:(href)|(src))//s*=(?:(?://s*\"(.+?)\")|(?://s*'(.+?)')|(//S+))");
103 //
104 //    // this pattern extracts 'robots' attributes
105 //    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
106 //     "(?is)(//w+)//s+.*?(?:(robots))//s*=(?:(?://s*\"(.+)\")|(?://s*'(.+)')|(//S+))");
107 
108       private static final int MAX_ATTR_NAME_LENGTH =
109           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
110               ".maxAttributeNameLength", "1024")); // 1K; 
111       
112       static final int MAX_ATTR_VAL_LENGTH = 
113           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
114               ".maxAttributeValueLength", "16384")); // 16K; 
115       
116     // TODO: perhaps cut to near MAX_URI_LENGTH
117     
118     // this pattern extracts attributes from any open-tag innards
119     // matched by the above. attributes known to be URIs of various
120     // sorts are matched specially
121     static final String EACH_ATTRIBUTE_EXTRACTOR =
122       "(?is)//b((href)|(action)|(on//w*)" // 1, 2, 3, 4 
123      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
124      +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5
125      +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
126      +"|(value)|(style)|(method)" // 10, 11, 12
127      +"|([-//w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 13
128      +"//s*=//s*"
129      +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 14
130      +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 15
131      +"|(//S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 16
132     // groups:
133     // 1: attribute name
134     // 2: HREF - single URI relative to doc base, or occasionally javascript:
135     // 3: ACTION - single URI relative to doc base, or occasionally javascript:
136     // 4: ON[WHATEVER] - script handler
137     // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
138     //    single URI relative to doc base
139     // 6: CODEBASE - a single URI relative to doc base, affecting other
140     //    attributes
141     // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
142     // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
143     //    (if supplied)
144     // 9: CODE - a single URI relative to the CODEBASE (is specified).
145     // 10: VALUE - often includes a uri path on forms
146     // 11: STYLE - inline attribute style info
147     // 12: METHOD - form GET/POST
148     // 13: any other attribute
149     // 14: double-quote delimited attr value
150     // 15: single-quote delimited attr value
151     // 16: space-delimited attr value
152 
153 
154     // much like the javascript likely-URI extractor, but
155     // without requiring quotes -- this can indicate whether
156     // an HTML tag attribute that isn't definitionally a
157     // URI might be one anyway, as in form-tag VALUE attributes
158     static final String LIKELY_URI_PATH =
159      "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
160     static final String WHITESPACE = "//s";
161     static final String CLASSEXT =".class";
162     static final String APPLET = "applet";
163     static final String BASE = "base";
164     static final String LINK = "link";
165     static final String FRAME = "frame";
166     static final String IFRAME = "iframe";
167 
168     public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
169         "treat-frames-as-embed-links";
170     
171     public static final String ATTR_IGNORE_FORM_ACTION_URLS =
172         "ignore-form-action-urls";
173 
174     public static final String ATTR_EXTRACT_ONLY_FORM_GETS =
175         "extract-only-form-gets";
176 
177     /*** whether to try finding links in Javscript; default true */
178     public static final String ATTR_EXTRACT_JAVASCRIPT =
179         "extract-javascript";
180 
181     public static final String EXTRACT_VALUE_ATTRIBUTES =
182         "extract-value-attributes";
183     
184     public static final String ATTR_IGNORE_UNEXPECTED_HTML = 
185         "ignore-unexpected-html";
186 
187     
188     protected long numberOfCURIsHandled = 0;
189     protected long numberOfLinksExtracted = 0;
190 
191     public ExtractorHTML(String name) {
192         this(name, "HTML extractor. Extracts links from HTML documents");
193     }
194     
195     public ExtractorHTML(String name, String description) {
196         super(name, description);
197         Type t = addElementToDefinition(
198             new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
199             "If true, in-page Javascript is scanned for strings that " +
200             "appear likely to be URIs. This typically finds both valid " +
201             "and invalid URIs, and attempts to fetch the invalid URIs " +
202             "sometimes generates webmaster concerns over odd crawler " +
203             "behavior. Default is true.",
204             Boolean.TRUE));
205         t.setExpertSetting(true);
206         t = addElementToDefinition(
207             new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
208             "If true, FRAME/IFRAME SRC-links are treated as embedded " +
209             "resources (like IMG, 'E' hop-type), otherwise they are " +
210             "treated as navigational links. Default is true.", Boolean.TRUE));
211         t.setExpertSetting(true);
212         t = addElementToDefinition(
213             new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
214             "If true, URIs appearing as the ACTION attribute in " +
215             "HTML FORMs are ignored. Default is false.", Boolean.FALSE));
216         t.setExpertSetting(true);
217         t = addElementToDefinition(
218                 new SimpleType(ATTR_EXTRACT_ONLY_FORM_GETS,
219                 "If true, only HTML FORM ACTIONs associated with the GET "+ 
220                 "method are extracted. (Form ACTIONs with method POST "+
221                 "will be ignored. Default is true", Boolean.TRUE));
222         t.setExpertSetting(true);
223         t = addElementToDefinition(
224             new SimpleType(EXTRACT_VALUE_ATTRIBUTES,
225             "If true, strings that look like URIs found in element VALUE " +
226             "attributes (which are sometimes used as URIs by in-page " +
227             "Javascript or server-side redirects) will be extracted. " +
228             "This typically finds both valid and invalid URIs, and " +
229             "attempts to fetch the invalid URIs sometimes generate " +
230             "webmaster concerns over odd crawler behavior. Default " +
231             "is true.",
232             Boolean.TRUE));
233         t.setExpertSetting(true);
234         t = addElementToDefinition(
235             new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
236             "If true, URIs which end in typical non-HTML extensions " +
237             "(such as .gif) will not be scanned as if it were HTML. " +
238             "Default is true.", Boolean.TRUE));
239         t.setExpertSetting(true);
240     }
241 
242     protected void processGeneralTag(CrawlURI curi, CharSequence element,
243             CharSequence cs) {
244 
245         Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
246 
247         // Just in case it's an OBJECT or APPLET tag
248         String codebase = null;
249         ArrayList<String> resources = null;
250         
251         // Just in case it's a FORM
252         CharSequence action = null;
253         CharSequence actionContext = null;
254         CharSequence method = null; 
255         
256         final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
257             ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
258 
259         final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
260                 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
261         
262         final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute
263                 (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
264         
265         final String elementStr = element.toString();
266 
267         while (attr.find()) {
268             int valueGroup =
269                 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
270             int start = attr.start(valueGroup);
271             int end = attr.end(valueGroup);
272             assert start >= 0: "Start is: " + start + ", " + curi;
273             assert end >= 0: "End is :" + end + ", " + curi;
274             CharSequence value = cs.subSequence(start, end);
275             value = TextUtils.unescapeHtml(value);
276             if (attr.start(2) > -1) {
277                 // HREF
278                 CharSequence context =
279                     Link.elementContext(element, attr.group(2));
280                 if(elementStr.equalsIgnoreCase(LINK)) {
281                     // <LINK> elements treated as embeds (css, ico, etc)
282                     processEmbed(curi, value, context);
283                 } else {
284                     // other HREFs treated as links
285                     processLink(curi, value, context);
286                 }
287                 if (elementStr.equalsIgnoreCase(BASE)) {
288                     try {
289                         curi.setBaseURI(value.toString());
290                     } catch (URIException e) {
291                         if (getController() != null) {
292                             // Controller can be null: e.g. when running
293                             // ExtractorTool.
294                             getController().logUriError(e, curi.getUURI(),
295                                 value.toString());
296                         } else {
297                             logger.info("Failed set base uri: " +
298                                 curi + ", " + value.toString() + ": " +
299                                 e.getMessage());
300                         }
301                     }
302                 }
303             } else if (attr.start(3) > -1) {
304                 // ACTION
305                 if (!ignoreFormActions) {
306                     action = value; 
307                     actionContext = Link.elementContext(element,
308                         attr.group(3));
309                     // handling finished only at end (after METHOD also collected)
310                 }
311             } else if (attr.start(4) > -1) {
312                 // ON____
313                 processScriptCode(curi, value); // TODO: context?
314             } else if (attr.start(5) > -1) {
315                 // SRC etc.
316                 CharSequence context = Link.elementContext(element,
317                     attr.group(5));
318                 
319                 // true, if we expect another HTML page instead of an image etc.
320                 final char hopType;
321                 
322                 if(!framesAsEmbeds
323                     && (elementStr.equalsIgnoreCase(FRAME) || elementStr
324                         .equalsIgnoreCase(IFRAME))) {
325                     hopType = Link.NAVLINK_HOP;
326                 } else {
327                     hopType = Link.EMBED_HOP;
328                 }
329                 processEmbed(curi, value, context, hopType);
330             } else if (attr.start(6) > -1) {
331                 // CODEBASE
332                 codebase = (value instanceof String)?
333                     (String)value: value.toString();
334                 CharSequence context = Link.elementContext(element,
335                     attr.group(6));
336                 processEmbed(curi, codebase, context);
337             } else if (attr.start(7) > -1) {
338                 // CLASSID, DATA
339                 if (resources == null) {
340                     resources = new ArrayList<String>();
341                 }
342                 resources.add(value.toString());
343             } else if (attr.start(8) > -1) {
344                 // ARCHIVE
345                 if (resources==null) {
346                     resources = new ArrayList<String>();
347                 }
348                 String[] multi = TextUtils.split(WHITESPACE, value);
349                 for(int i = 0; i < multi.length; i++ ) {
350                     resources.add(multi[i]);
351                 }
352             } else if (attr.start(9) > -1) {
353                 // CODE
354                 if (resources==null) {
355                     resources = new ArrayList<String>();
356                 }
357                 // If element is applet and code value does not end with
358                 // '.class' then append '.class' to the code value.
359                 if (elementStr.equalsIgnoreCase(APPLET) &&
360                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
361                     resources.add(value.toString() + CLASSEXT);
362                 } else {
363                     resources.add(value.toString());
364                 }
365             } else if (attr.start(10) > -1) {
366                 // VALUE, with possibility of URI
367                 if (extractValueAttributes 
368                         && TextUtils.matches(LIKELY_URI_PATH, value)) {
369                     CharSequence context = Link.elementContext(element,
370                         attr.group(10));
371                     processLink(curi,value, context);
372                 }
373 
374             } else if (attr.start(11) > -1) {
375                 // STYLE inline attribute
376                 // then, parse for URIs
377                 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
378                     curi, value, getController());
379                 
380             } else if (attr.start(12) > -1) {
381                 // METHOD
382                 method = value;
383                 // form processing finished at end (after ACTION also collected)
384             } else if (attr.start(13) > -1) {
385                 // any other attribute
386                 // ignore for now
387                 // could probe for path- or script-looking strings, but
388                 // those should be vanishingly rare in other attributes,
389                 // and/or symptomatic of page bugs
390             }
391         }
392         TextUtils.recycleMatcher(attr);
393 
394         // finish handling codebase/resources now that all available
395         if (resources != null) {
396             Iterator iter = resources.iterator();
397             UURI codebaseURI = null;
398             String res = null;
399             try {
400                 if (codebase != null) {
401                     // TODO: Pass in the charset.
402                     codebaseURI = UURIFactory.
403                         getInstance(curi.getUURI(), codebase);
404                 }
405                 while(iter.hasNext()) {
406                     res = iter.next().toString();
407                     res = (String) TextUtils.unescapeHtml(res);
408                     if (codebaseURI != null) {
409                         res = codebaseURI.resolve(res).toString();
410                     }
411                     processEmbed(curi, res, element); // TODO: include attribute too
412                 }
413             } catch (URIException e) {
414                 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
415             } catch (IllegalArgumentException e) {
416                 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
417                     "codebase=" + codebase + " res=" + res + "\n" +
418                     DevUtils.extraInfo(), e);
419             }
420         }
421         
422         // finish handling form action, now method is available
423         if(action != null) {
424             if(method == null || "GET".equalsIgnoreCase(method.toString()) 
425                     || ! ((Boolean)getUncheckedAttribute(curi,
426                             ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
427                 processLink(curi, action, actionContext);
428             }
429         }
430     }
431 
432     /***
433      * Extract the (java)script source in the given CharSequence. 
434      * 
435      * @param curi source CrawlURI
436      * @param cs CharSequence of javascript code
437      */
438     protected void processScriptCode(CrawlURI curi, CharSequence cs) {
439         if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
440             this.numberOfLinksExtracted +=
441                 ExtractorJS.considerStrings(curi, cs, getController(), false);
442         } // else do nothing
443     }
444 
445     static final String JAVASCRIPT = "(?i)^javascript:.*";
446 
447     /***
448      * Handle generic HREF cases.
449      * 
450      * @param curi
451      * @param value
452      * @param context
453      */
454     protected void processLink(CrawlURI curi, final CharSequence value,
455             CharSequence context) {
456         if (TextUtils.matches(JAVASCRIPT, value)) {
457             processScriptCode(curi, value. subSequence(11, value.length()));
458         } else {    
459             if (logger.isLoggable(Level.FINEST)) {
460                 logger.finest("link: " + value.toString() + " from " + curi);
461             }
462             addLinkFromString(curi,
463                 (value instanceof String)?
464                     (String)value: value.toString(),
465                 context, Link.NAVLINK_HOP);
466             this.numberOfLinksExtracted++;
467         }
468     }
469 
470     private void addLinkFromString(CrawlURI curi, String uri,
471             CharSequence context, char hopType) {
472         try {
473             // We do a 'toString' on context because its a sequence from
474             // the underlying ReplayCharSequence and the link its about
475             // to become a part of is expected to outlive the current
476             // ReplayCharSequence.
477             curi.createAndAddLinkRelativeToBase(uri, context.toString(),
478                 hopType);
479         } catch (URIException e) {
480             if (getController() != null) {
481                 getController().logUriError(e, curi.getUURI(), uri);
482             } else {
483                 logger.info("Failed createAndAddLinkRelativeToBase " +
484                     curi + ", " + uri + ", " + context + ", " + hopType +
485                     ": " + e);
486             }
487         }
488     }
489 
490     protected final void processEmbed(CrawlURI curi, CharSequence value,
491             CharSequence context) {
492         processEmbed(curi, value, context, Link.EMBED_HOP);
493     }
494 
495     protected void processEmbed(CrawlURI curi, final CharSequence value,
496             CharSequence context, char hopType) {
497         if (logger.isLoggable(Level.FINEST)) {
498             logger.finest("embed (" + hopType + "): " + value.toString() +
499                 " from " + curi);
500         }
501         addLinkFromString(curi,
502             (value instanceof String)?
503                 (String)value: value.toString(),
504             context, hopType);
505         this.numberOfLinksExtracted++;
506     }
507 
508     public void extract(CrawlURI curi) {
509         if (!isHttpTransactionContentToProcess(curi) ||
510                 ! (isExpectedMimeType(curi.getContentType(), "text/html")
511                    || isExpectedMimeType(curi.getContentType(), "application/xhtml")
512                    || isExpectedMimeType(curi.getContentType(), "text/vnd.wap.wml")
513                    || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.wml")
514                    || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.xhtml"))) {
515             return;
516         }
517 
518         final boolean ignoreUnexpectedHTML =
519              ((Boolean)getUncheckedAttribute(curi, 
520                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();        
521 
522         if (ignoreUnexpectedHTML) {
523             try {
524                 if(!isHtmlExpectedHere(curi)) {
525                     // HTML was not expected (eg a GIF was expected) so ignore
526                     // (as if a soft 404)
527                     return;
528                 }
529             } catch (URIException e) {
530                 logger.severe("Failed expectedHTML test: " + e.getMessage());
531             }
532         }
533 
534         this.numberOfCURIsHandled++;
535 
536         ReplayCharSequence cs = null;
537         
538         try {
539            HttpRecorder hr = curi.getHttpRecorder();
540            if (hr == null) {
541                throw new IOException("Why is recorder null here?");
542            }
543            cs = hr.getReplayCharSequence();
544         } catch (IOException e) {
545             curi.addLocalizedError(this.getName(), e,
546                 "Failed get of replay char sequence " + curi.toString() +
547                     " " + e.getMessage());
548             logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
549                 Thread.currentThread().getName(), e);
550         }
551         
552         if (cs == null) {
553             return;
554         }
555 
556         // We have a ReplayCharSequence open.  Wrap all in finally so we
557         // for sure close it before we leave.
558         try {
559             // Extract all links from the charsequence
560             extract(curi, cs);
561             // Set flag to indicate that link extraction is completed.
562             curi.linkExtractorFinished();
563         } finally {
564             if (cs != null) {
565                 try {
566                     cs.close();
567                 } catch (IOException ioe) {
568                     logger.warning(TextUtils.exceptionToString(
569                         "Failed close of ReplayCharSequence.", ioe));
570                 }
571             }
572         }
573     }
574 
575     /***
576      * Run extractor.
577      * This method is package visible to ease testing.
578      * @param curi CrawlURI we're processing.
579      * @param cs Sequence from underlying ReplayCharSequence. This
580      * is TRANSIENT data. Make a copy if you want the data to live outside
581      * of this extractors' lifetime.
582      */
583     void extract(CrawlURI curi, CharSequence cs) {
584         Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
585         while(tags.find()) {
586             if(Thread.interrupted()){
587                 break;
588             }
589             if (tags.start(8) > 0) {
590                 // comment match
591                 // for now do nothing
592             } else if (tags.start(7) > 0) {
593                 // <meta> match
594                 int start = tags.start(5);
595                 int end = tags.end(5);
596                 assert start >= 0: "Start is: " + start + ", " + curi;
597                 assert end >= 0: "End is :" + end + ", " + curi;
598                 if (processMeta(curi,
599                     cs.subSequence(start, end))) {
600 
601                     // meta tag included NOFOLLOW; abort processing
602                     break;
603                 }
604             } else if (tags.start(5) > 0) {
605                 // generic <whatever> match
606                 int start5 = tags.start(5);
607                 int end5 = tags.end(5);
608                 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
609                 assert end5 >= 0: "End is :" + end5 + ", " + curi;
610                 int start6 = tags.start(6);
611                 int end6 = tags.end(6);
612                 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
613                 assert end6 >= 0: "End is :" + end6 + ", " + curi;
614                 processGeneralTag(curi,
615                     cs.subSequence(start6, end6),
616                     cs.subSequence(start5, end5));
617 
618             } else if (tags.start(1) > 0) {
619                 // <script> match
620                 int start = tags.start(1);
621                 int end = tags.end(1);
622                 assert start >= 0: "Start is: " + start + ", " + curi;
623                 assert end >= 0: "End is :" + end + ", " + curi;
624                 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
625                     ", " + curi;
626                 processScript(curi, cs.subSequence(start, end),
627                     tags.end(2) - start);
628 
629             } else if (tags.start(3) > 0){
630                 // <style... match
631                 int start = tags.start(3);
632                 int end = tags.end(3);
633                 assert start >= 0: "Start is: " + start + ", " + curi;
634                 assert end >= 0: "End is :" + end + ", " + curi;
635                 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
636                     ", " + curi;
637                 processStyle(curi, cs.subSequence(start, end),
638                     tags.end(4) - start);
639             }
640         }
641         TextUtils.recycleMatcher(tags);
642     }
643 
644 
645     static final String NON_HTML_PATH_EXTENSION =
646         "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
647         "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
648 
649     /***
650      * Test whether this HTML is so unexpected (eg in place of a GIF URI)
651      * that it shouldn't be scanned for links.
652      *
653      * @param curi CrawlURI to examine.
654      * @return True if HTML is acceptable/expected here
655      * @throws URIException
656      */
657     protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
658         String path = curi.getUURI().getPath();
659         if(path==null) {
660             // no path extension, HTML is fine
661             return true;
662         }
663         int dot = path.lastIndexOf('.');
664         if (dot < 0) {
665             // no path extension, HTML is fine
666             return true;
667         }
668         if(dot<(path.length()-5)) {
669             // extension too long to recognize, HTML is fine
670             return true;
671         }
672         String ext = path.substring(dot+1);
673         return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
674     }
675 
676     protected void processScript(CrawlURI curi, CharSequence sequence,
677             int endOfOpenTag) {
678         // first, get attributes of script-open tag
679         // as per any other tag
680         processGeneralTag(curi,sequence.subSequence(0,6),
681             sequence.subSequence(0,endOfOpenTag));
682 
683         // then, apply best-effort string-analysis heuristics
684         // against any code present (false positives are OK)
685         processScriptCode(
686             curi, sequence.subSequence(endOfOpenTag, sequence.length()));
687     }
688 
689     /***
690      * Process metadata tags.
691      * @param curi CrawlURI we're processing.
692      * @param cs Sequence from underlying ReplayCharSequence. This
693      * is TRANSIENT data. Make a copy if you want the data to live outside
694      * of this extractors' lifetime.
695      * @return True robots exclusion metatag.
696      */
697     protected boolean processMeta(CrawlURI curi, CharSequence cs) {
698         Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
699         String name = null;
700         String httpEquiv = null;
701         String content = null;
702         while (attr.find()) {
703             int valueGroup =
704                 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
705             CharSequence value =
706                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
707             value = TextUtils.unescapeHtml(value);
708             if (attr.group(1).equalsIgnoreCase("name")) {
709                 name = value.toString();
710             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
711                 httpEquiv = value.toString();
712             } else if (attr.group(1).equalsIgnoreCase("content")) {
713                 content = value.toString();
714             }
715             // TODO: handle other stuff
716         }
717         TextUtils.recycleMatcher(attr);
718 
719         // Look for the 'robots' meta-tag
720         if("robots".equalsIgnoreCase(name) && content != null ) {
721             curi.putString(A_META_ROBOTS, content);
722             RobotsHonoringPolicy policy =
723                 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
724             String contentLower = content.toLowerCase();
725             if ((policy == null
726                 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
727                     && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
728                 && (contentLower.indexOf("nofollow") >= 0
729                     || contentLower.indexOf("none") >= 0)) {
730                 // if 'nofollow' or 'none' is specified and the
731                 // honoring policy is not IGNORE or CUSTOM, end html extraction
732                 logger.fine("HTML extraction skipped due to robots meta-tag for: "
733                                 + curi.toString());
734                 return true;
735             }
736         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
737             int urlIndex = content.indexOf("=") + 1;
738             if(urlIndex>0) {
739                 String refreshUri = content.substring(urlIndex);
740                 try {
741                     curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
742                         Link.REFER_HOP);
743                 } catch (URIException e) {
744                     if (getController() != null) {
745                         getController().logUriError(e, curi.getUURI(), refreshUri);
746                     } else {
747                         logger.info("Failed createAndAddLinkRelativeToBase " +
748                             curi + ", " + cs + ", " + refreshUri + ": " + e);
749                     }
750                 }
751             }
752         }
753         return false;
754     }
755 
756     /***
757      * Process style text.
758      * @param curi CrawlURI we're processing.
759      * @param sequence Sequence from underlying ReplayCharSequence. This
760      * is TRANSIENT data. Make a copy if you want the data to live outside
761      * of this extractors' lifetime.
762      * @param endOfOpenTag
763      */
764     protected void processStyle(CrawlURI curi, CharSequence sequence,
765             int endOfOpenTag) {
766         // First, get attributes of script-open tag as per any other tag.
767         processGeneralTag(curi, sequence.subSequence(0,6),
768             sequence.subSequence(0,endOfOpenTag));
769 
770         // then, parse for URIs
771         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
772             curi, sequence.subSequence(endOfOpenTag,sequence.length()),
773                 getController());
774     }
775     
776 
777 
778     /* (non-Javadoc)
779      * @see org.archive.crawler.framework.Processor#report()
780      */
781     public String report() {
782         StringBuffer ret = new StringBuffer();
783         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
784         ret.append("  Function:          Link extraction on HTML documents\n");
785         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
786         ret.append("  Links extracted:   " + this.numberOfLinksExtracted +
787             "\n\n");
788         return ret.toString();
789     }
790 }
791