
/* Robotstxt.java
 *
 * $Id: Robotstxt.java 5940 2008-08-01 21:14:16Z gojomo $
 *
 * Created Sep 1, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Utility class for parsing 'robots.txt' directives into a list of 
 * named user-agents and a map from each user-agent to its 
 * RobotsDirectives.
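 *
 * <p>A minimal usage sketch. It assumes RobotsDirectives exposes an
 * allows(String) check, which is not defined in this file; verify
 * against the RobotsDirectives in your tree:
 * <pre>
 *   BufferedReader reader = new BufferedReader(new FileReader("robots.txt"));
 *   Robotstxt robots = new Robotstxt(reader);
 *   RobotsDirectives directives = robots.getDirectivesFor("mybot");
 *   boolean allowed = directives.allows("/private/page.html");
 * </pre>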
 */
public class Robotstxt implements Serializable {
    static final long serialVersionUID = 7025386509301303890L;

    // all user agents contained in this robots.txt
    // may be thinned of irrelevant entries
    LinkedList<String> userAgents = new LinkedList<String>();
    // map user-agents to directives
    Map<String,RobotsDirectives> agentsToDirectives = 
        new HashMap<String,RobotsDirectives>();
    // whether any malformed lines were encountered while parsing
    boolean hasErrors = false;

    // empty directives, returned when no listed agent matches
    static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
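
    /**
     * Parse robots.txt directives from the given reader, which is
     * closed once exhausted. Consecutive User-Agent lines with no
     * intervening directives share one RobotsDirectives object, so a
     * fragment like
     * <pre>
     *   User-agent: goodbot
     *   User-agent: otherbot
     *   Disallow: /private
     * </pre>
     * maps both "goodbot" and "otherbot" to the same rules.
     *
     * @param reader robots.txt content; closed by this constructor
     * @throws IOException on error reading from the reader
     */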
    public Robotstxt(BufferedReader reader) throws IOException {
        String read;
        // directives collected for the preceding User-Agent line(s)
        RobotsDirectives current = null;
        // whether a non-'User-Agent' directive has been encountered
        boolean hasDirectivesYet = false; 
        String catchall = null;
        while (reader != null) {
            do {
                read = reader.readLine();
                // Skip comments & blanks
            } while ((read != null) && ((read = read.trim()).startsWith("#") ||
                read.length() == 0));
            if (read == null) {
                reader.close();
                reader = null;
            } else {
                // remove any html markup
                read = read.replaceAll("<[^>]+>","");
                int commentIndex = read.indexOf("#");
                if (commentIndex > -1) {
                    // Strip trailing comment
                    read = read.substring(0, commentIndex);
                }
                read = read.trim();
                if (read.matches("(?i)^User-agent:.*")) {
                    String ua = read.substring(11).trim().toLowerCase();
                    if (current == null || hasDirectivesYet) {
                        // only create new rules-list if necessary
                        // otherwise share with previous user-agent
                        current = new RobotsDirectives();
                        hasDirectivesYet = false; 
                    }
                    if (ua.equals("*")) {
                        // store the catchall under the empty string; it is
                        // appended to userAgents last (see end of this
                        // constructor), so specific agents match first
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    agentsToDirectives.put(ua, current);
                    continue;
                }
                if (read.matches("(?i)Disallow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(9).trim();
                    current.addDisallow(path);
                    hasDirectivesYet = true; 
                    continue;
                }
                if (read.matches("(?i)Crawl-delay:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    // consider a crawl-delay, even though we don't 
                    // yet understand it, as sufficient to end a 
                    // grouping of User-Agent lines
                    hasDirectivesYet = true;
                    String val = read.substring(12).trim();
                    // keep only the leading numeric portion of the value;
                    // guard against an empty result (split() returns an
                    // empty array when the whole value is non-numeric)
                    String[] parts = val.split("[^\\d.]+");
                    if (parts.length > 0) {
                        try {
                            current.setCrawlDelay(Float.parseFloat(parts[0]));
                        } catch (NumberFormatException nfe) {
                            // ignore unparseable values
                        }
                    }
                    continue;
                }
                if (read.matches("(?i)Allow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(6).trim();
                    current.addAllow(path);
                    hasDirectivesYet = true;
                    continue;
                }
                // unknown line; do nothing for now
            }
        }

        if (catchall != null) {
            userAgents.addLast(catchall);
        }
    }

    /**
     * Does this policy effectively allow everything? (No 
     * disallows or timing (crawl-delay) directives?)
     * @return true if no user-agent groups were parsed at all,
     *         and thus nothing is restricted
     */
    public boolean allowsAll() {
        // TODO: refine so directives that are all empty are also 
        // recognized as allowing all
        return agentsToDirectives.isEmpty();
    }
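
    /**
     * @return the user-agent names listed in this robots.txt, lowercased,
     *         with the catchall entry (the empty string) last if present
     */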
    public List<String> getUserAgents() {
        return userAgents;
    }
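
    /**
     * Look up directives by substring match: the first listed agent name
     * contained in the given (expected lowercase) user-agent string wins.
     * The catchall entry is the empty string and is listed last, so it
     * matches any user-agent once all specific names have failed.
     *
     * @param ua user-agent string to match (lowercase)
     * @return directives for the first matching agent, or NO_DIRECTIVES
     *         if none match
     */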
    public RobotsDirectives getDirectivesFor(String ua) {
        // find matching ua
        for(String uaListed : userAgents) {
            if(ua.indexOf(uaListed)>-1) {
                return agentsToDirectives.get(uaListed);
            }
        }
        // no applicable user-agents, so empty directives
        return NO_DIRECTIVES; 
    }
}