/* Robotstxt.java
 *
 * $Id: Robotstxt.java 5940 2008-08-01 21:14:16Z gojomo $
 *
 * Created Sep 1, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Utility class for parsing and representing 'robots.txt' format
 * directives as a list of named user-agents and a map from
 * user-agents to RobotsDirectives.
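 *
 * <p>A minimal usage sketch (an illustration only: it assumes the
 * companion {@code RobotsDirectives} class exposes an
 * {@code allows(String path)} test, and that callers pass a
 * lower-cased user-agent string):
 * <pre>
 * BufferedReader reader = new BufferedReader(new StringReader(
 *     "User-agent: *\nDisallow: /private/\n"));
 * Robotstxt robots = new Robotstxt(reader);
 * RobotsDirectives directives = robots.getDirectivesFor("mybot");
 * boolean allowed = directives.allows("/private/page.html");
 * </pre>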
 */
public class Robotstxt implements Serializable {
    private static final long serialVersionUID = 7025386509301303890L;

    // all user-agents contained in this robots.txt, in the order
    // encountered; may be thinned of irrelevant entries
    LinkedList<String> userAgents = new LinkedList<String>();
    // map user-agents to directives
    Map<String,RobotsDirectives> agentsToDirectives =
        new HashMap<String,RobotsDirectives>();
    // whether any syntax problems were encountered while parsing
    boolean hasErrors = false;

    static final RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();

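    /**
     * Parse the robots.txt body supplied by the given reader, which
     * is closed once it is exhausted.
     * @param reader source of robots.txt lines
     * @throws IOException if reading fails
     */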
    public Robotstxt(BufferedReader reader) throws IOException {
        String read;
        // current holds the directives (disallows, allows, crawl-delay)
        // collected for the preceding User-Agent line(s)
        RobotsDirectives current = null;
        // whether a non-'User-Agent' directive has been encountered
        boolean hasDirectivesYet = false;
        String catchall = null;
        // loop until end-of-input (reader is nulled out at EOF below)
        while (reader != null) {
            do {
                read = reader.readLine();
                // Skip comments & blanks
            } while ((read != null) && ((read = read.trim()).startsWith("#")
                    || read.length() == 0));
            if (read == null) {
                reader.close();
                reader = null;
            } else {
                // remove any html markup
                read = read.replaceAll("<[^>]+>", "");
                int commentIndex = read.indexOf("#");
                if (commentIndex > -1) {
                    // Strip trailing comment
                    read = read.substring(0, commentIndex);
                }
                read = read.trim();
                if (read.matches("(?i)User-agent:.*")) {
                    String ua = read.substring(11).trim().toLowerCase();
                    if (current == null || hasDirectivesYet) {
                        // only create a new rules-list if necessary;
                        // otherwise share with the previous user-agent
                        current = new RobotsDirectives();
                        hasDirectivesYet = false;
                    }
                    if (ua.equals("*")) {
                        // remember the wildcard as the empty string: ""
                        // is a substring of every user-agent, and it is
                        // appended last (below) so that more specific
                        // listings match first in getDirectivesFor()
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    agentsToDirectives.put(ua, current);
                    continue;
                }
                if (read.matches("(?i)Disallow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(9).trim();
                    current.addDisallow(path);
                    hasDirectivesYet = true;
                    continue;
                }
                if (read.matches("(?i)Crawl-delay:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    // consider a crawl-delay, even though we don't
                    // yet understand it, as sufficient to end a
                    // grouping of User-Agent lines
                    hasDirectivesYet = true;
                    String val = read.substring(12).trim();
                    // keep only the leading numeric token (digits and '.')
                    val = val.split("[^\\d.]+")[0];
                    try {
                        current.setCrawlDelay(Float.parseFloat(val));
                    } catch (NumberFormatException nfe) {
                        // ignore unparseable values
                    }
                    continue;
                }
                if (read.matches("(?i)Allow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(6).trim();
                    current.addAllow(path);
                    hasDirectivesYet = true;
                    continue;
                }
                // unknown line; do nothing for now
            }
        }

        // append the catch-all entry last, so any more specific
        // user-agent listings take precedence when matching
        if (catchall != null) {
            userAgents.addLast(catchall);
        }
    }

    /**
     * Does this policy effectively allow everything? (No
     * disallows or timing (crawl-delay) directives?)
     * @return true if no user-agent groups with directives were parsed
     */
    public boolean allowsAll() {
        // TODO: refine so directives that are all empty are also
        // recognized as allowing all
        return agentsToDirectives.isEmpty();
    }

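    /**
     * @return the user-agents listed in this robots.txt, in the order
     * encountered, with any catch-all entry (stored as "") last
     */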
    public List<String> getUserAgents() {
        return userAgents;
    }

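    /**
     * Get the directives applying to the given user-agent, by
     * substring-matching it against each listed agent (callers are
     * expected to pass a lower-cased string). Because the catch-all
     * entry is the empty string and sits last in the list, it matches
     * any user-agent when nothing more specific applies.
     * @param ua user-agent string to look up (lower-case)
     * @return the applicable RobotsDirectives, or NO_DIRECTIVES if
     * this robots.txt names no applicable user-agents; never null
     */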
    public RobotsDirectives getDirectivesFor(String ua) {
        // find first matching user-agent
        for (String uaListed : userAgents) {
            if (ua.indexOf(uaListed) > -1) {
                return agentsToDirectives.get(uaListed);
            }
        }
        // no applicable user-agents, so empty directives
        return NO_DIRECTIVES;
    }
}