package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

import junit.framework.TestCase;

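/**
 * Unit tests for {@link Robotstxt}: parsing of user-agent sections and
 * lookup of per-agent directives (Allow/Disallow, Crawl-Delay).
 */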
public class RobotstxtTest extends TestCase {
    public void testParseRobots() throws IOException {
        // Content with no recognizable directives should parse without
        // errors and yield no user agents.
        BufferedReader reader = new BufferedReader(new StringReader("BLAH"));
        Robotstxt r = new Robotstxt(reader);
        assertFalse(r.hasErrors);
        assertEquals(0, r.getUserAgents().size());

        // A single named agent with two Disallow rules.
        String agent = "archive.org_bot";
        reader = new BufferedReader(
            new StringReader("User-agent: " + agent + "\n" +
                "Disallow: /cgi-bin/\n" +
                "Disallow: /details/software\n"));
        r = new Robotstxt(reader);
        assertFalse(r.hasErrors);
        assertEquals(1, r.getUserAgents().size());
        assertEquals(1, r.agentsToDirectives.size());
        assertEquals(agent, r.getUserAgents().get(0));

        // The wildcard agent "*" is stored as the empty string.
        agent = "*";
        reader = new BufferedReader(
            new StringReader("User-agent: " + agent + "\n" +
                "Disallow: /cgi-bin/\n" +
                "Disallow: /details/software\n"));
        r = new Robotstxt(reader);
        assertFalse(r.hasErrors);
        assertEquals(1, r.getUserAgents().size());
        assertEquals(1, r.agentsToDirectives.size());
        assertEquals("", r.getUserAgents().get(0));
    }

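    /**
     * Builds a sample robots.txt exercising the main cases: a wildcard
     * section, a deny-all agent, two allow-all agents (empty Disallow and
     * explicit Allow), and an agent with a Crawl-Delay.
     */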
    Robotstxt sampleRobots1() throws IOException {
        BufferedReader reader = new BufferedReader(
            new StringReader(
                "User-agent: *\n" +
                "Disallow: /cgi-bin/\n" +
                "Disallow: /details/software\n" +
                "\n" +
                "User-agent: denybot\n" +
                "Disallow: /\n" +
                "\n" +
                "User-agent: allowbot1\n" +
                "Disallow: \n" +
                "\n" +
                "User-agent: allowbot2\n" +
                "Disallow: /foo\n" +
                "Allow: /\n" +
                "\n" +
                "User-agent: delaybot\n" +
                "Disallow: /\n" +
                "Crawl-Delay: 20\n" +
                "Allow: /images/\n"
            ));
        return new Robotstxt(reader);
    }

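    /**
     * Directive lookup matches the agent token inside a larger user-agent
     * string (e.g. "Mozilla allowbot1 99.9" resolves to the allowbot1
     * section); agents with no matching section fall back to the wildcard.
     */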
    public void testDirectives() throws IOException {
        Robotstxt r = sampleRobots1();

        // An empty Disallow means everything is allowed.
        assertTrue(r.getDirectivesFor("Mozilla allowbot1 99.9").allows("/path"));
        assertTrue(r.getDirectivesFor("Mozilla allowbot1 99.9").allows("/"));

        // An explicit "Allow: /" overrides the narrower "Disallow: /foo".
        assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/path"));
        assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/"));
        assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/foo"));

        // "Disallow: /" blocks everything.
        assertFalse(r.getDirectivesFor("Mozilla denybot 99.9").allows("/path"));
        assertFalse(r.getDirectivesFor("Mozilla denybot 99.9").allows("/"));

        // An unlisted agent falls back to the wildcard section.
        assertTrue(r.getDirectivesFor("Mozilla anonbot 99.9").allows("/path"));
        assertFalse(r.getDirectivesFor("Mozilla anonbot 99.9").allows("/cgi-bin/foo.pl"));

        // A missing Crawl-Delay is reported as -1.
        assertEquals(-1f, r.getDirectivesFor("Mozilla denybot 99.9").getCrawlDelay(), 0f);

        assertEquals(20f, r.getDirectivesFor("Mozilla delaybot 99.9").getCrawlDelay(), 0f);
    }

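    /**
     * Builds a robots.txt wrapped in stray HTML markup (including a
     * deliberately unclosed HEAD element), as served by some misconfigured
     * sites; the parser should still pick out the directives.
     */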
    Robotstxt htmlMarkupRobots() throws IOException {
        BufferedReader reader = new BufferedReader(
            new StringReader(
                "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n"
                + "<HEAD>\n"
                + "<TITLE>/robots.txt</TITLE>\n"
                + "<HEAD>\n"
                + "<BODY>\n"
                + "User-agent: *<BR>\n"
                + "Disallow: /<BR>\n"
                + "Crawl-Delay: 30<BR>\n"
                + "\n"
                + "</BODY>\n"
                + "</HTML>\n"
            ));
        return new Robotstxt(reader);
    }

    /**
     * Tests handling of a robots.txt with extraneous HTML markup.
     * @throws IOException
     */
    public void testHtmlMarkupRobots() throws IOException {
        Robotstxt r = htmlMarkupRobots();
        assertFalse(r.getDirectivesFor("anybot").allows("/index.html"));
        assertEquals(30f, r.getDirectivesFor("anybot").getCrawlDelay(), 0f);
    }
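
    /*
     * Illustrative sketch, not part of the original suite: an agent that
     * never appears in the file (the hypothetical "examplebot") should be
     * governed by the wildcard section, mirroring the anonbot assertions
     * in testDirectives() above. Uses only the API already exercised here.
     */
    public void testUnlistedAgentFallsBackToWildcard() throws IOException {
        Robotstxt r = sampleRobots1();
        // The wildcard section disallows /cgi-bin/ but not /path.
        assertTrue(r.getDirectivesFor("examplebot").allows("/path"));
        assertFalse(r.getDirectivesFor("examplebot").allows("/cgi-bin/query.pl"));
    }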
}