
1   /* AbstractFrontier
2    *
3    * $Id: AbstractFrontier.java 5882 2008-07-17 21:02:28Z gojomo $
4    *
5    * Created on Aug 17, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.frontier;
26  
27  import java.io.BufferedWriter;
28  import java.io.File;
29  import java.io.FileWriter;
30  import java.io.IOException;
31  import java.io.PrintWriter;
32  import java.io.Serializable;
33  import java.io.StringWriter;
34  import java.io.Writer;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.concurrent.atomic.AtomicLong;
38  import java.util.logging.Level;
39  import java.util.logging.Logger;
40  import java.util.regex.Pattern;
41  
42  import javax.management.AttributeNotFoundException;
43  
44  import org.apache.commons.httpclient.HttpStatus;
45  import org.archive.crawler.datamodel.CandidateURI;
46  import org.archive.crawler.datamodel.CoreAttributeConstants;
47  import org.archive.crawler.datamodel.CrawlHost;
48  import org.archive.crawler.datamodel.CrawlOrder;
49  import org.archive.crawler.datamodel.CrawlServer;
50  import org.archive.crawler.datamodel.CrawlSubstats;
51  import org.archive.crawler.datamodel.CrawlURI;
52  import org.archive.crawler.datamodel.FetchStatusCodes;
53  import org.archive.crawler.datamodel.RobotsExclusionPolicy;
54  import org.archive.crawler.datamodel.CrawlSubstats.Stage;
55  import org.archive.crawler.event.CrawlStatusListener;
56  import org.archive.crawler.framework.CrawlController;
57  import org.archive.crawler.framework.Frontier;
58  import org.archive.crawler.framework.ToeThread;
59  import org.archive.crawler.framework.exceptions.EndedException;
60  import org.archive.crawler.framework.exceptions.FatalConfigurationException;
61  import org.archive.crawler.settings.ModuleType;
62  import org.archive.crawler.settings.RegularExpressionConstraint;
63  import org.archive.crawler.settings.SimpleType;
64  import org.archive.crawler.settings.Type;
65  import org.archive.crawler.url.Canonicalizer;
66  import org.archive.net.UURI;
67  import org.archive.util.ArchiveUtils;
68  
69  /***
70   * Shared facilities for Frontier implementations.
71   * 
72   * @author gojomo
73   */
74  public abstract class AbstractFrontier extends ModuleType
75  implements CrawlStatusListener, Frontier, FetchStatusCodes,
76          CoreAttributeConstants, Serializable {
77      private static final long serialVersionUID = -4766504935003203930L;
78  
79      private static final Logger logger = Logger
80              .getLogger(AbstractFrontier.class.getName());
81  
82      protected transient CrawlController controller;
83  
84      /*** ordinal numbers to assign to created CrawlURIs */
85      protected AtomicLong nextOrdinal = new AtomicLong(1); 
86  
87      /*** should the frontier hold any threads asking for URIs? */
88      protected boolean shouldPause = false;
89  
90      /***
91       * should the frontier send an EndedException to any threads asking for
92       * URIs?
93       */
94      protected transient boolean shouldTerminate = false;
95  
96      /***
97       * how many multiples of last fetch elapsed time to wait before recontacting
98       * same server
99       */
100     public final static String ATTR_DELAY_FACTOR = "delay-factor";
101 
102     protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);
103 
104     /***
105      * always wait this long after one completion before recontacting same
106      * server, regardless of multiple
107      */
108     public final static String ATTR_MIN_DELAY = "min-delay-ms";
109 
110     // 3 secs.
111     protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);
112 
113     /***
114      * Whether to respect a 'Crawl-Delay' (in seconds) given in a site's
115      * robots.txt
116      */
117     public final static String 
118         ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS = "respect-crawl-delay-up-to-secs";
119 
120     // by default, respect robots.txt-provided Crawl-Delay up to 300 secs
121     protected final static Integer 
122         DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS = 300; // 5 minutes
123     
124     /*** never wait more than this long, regardless of multiple */
125     public final static String ATTR_MAX_DELAY = "max-delay-ms";
126 
127     // 30 secs
128     protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);
129 
130     /*** number of hops of embeds (ERX) to bump to front of host queue */
131     public final static String ATTR_PREFERENCE_EMBED_HOPS =
132         "preference-embed-hops";
133 
134     protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
135         new Integer(1);
136 
137     /*** maximum per-host bandwidth usage */
138     public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
139         "max-per-host-bandwidth-usage-KB-sec";
140 
141     protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
142         new Integer(0);
143 
144     /*** maximum overall bandwidth usage */
145     public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
146         "total-bandwidth-usage-KB-sec";
147 
148     protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
149         new Integer(0);
150 
151     /*** for retryable problems, seconds to wait before a retry */
152     public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";
153 
154     // 15 mins
155     protected final static Long DEFAULT_RETRY_DELAY = new Long(900);
156 
157     /*** maximum times to emit a CrawlURI without final disposition */
158     public final static String ATTR_MAX_RETRIES = "max-retries";
159 
160     protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);
161 
162     public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
163         "queue-assignment-policy";
164 
165     /*** queue assignment to force onto CrawlURIs; intended to be overridden */
166     public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
167 
168     protected final static String DEFAULT_FORCE_QUEUE = "";
169 
170     // word chars, dash, period, comma, colon
171     protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";
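    // For example, force-queue values such as "example.com" or
    // "blog-hosts:custom" match this pattern, while values containing spaces
    // or slashes would trip the WARNING constraint added in the constructor.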
172         
173     /*** whether pause, rather than finish, when crawl appears done */
174     public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";
175     // TODO: change default to true once well-tested
176     protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;
177     
178     /*** whether to pause at crawl start */
179     public final static String ATTR_PAUSE_AT_START = "pause-at-start";
180     protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;
181     
182     /*** whether to tag seeds with their own URI as a heritable 'source' value */
183     public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";
184     protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;
185 
186     /***
187      * Recover log on or off attribute.
188      */
189     protected final static String ATTR_RECOVERY_ENABLED =
190         "recovery-log-enabled";
191     protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
192         Boolean.TRUE;
193 
194     // to maintain serialization compatibility, stored under old names
195     protected long queuedUriCount;
196     protected long succeededFetchCount;
197     protected long failedFetchCount;
198     protected long disregardedUriCount;
199     
200     // top-level stats
201     /*** total URIs queued to be visited */
202     transient protected AtomicLong liveQueuedUriCount = new AtomicLong(0); 
203 
204     transient protected AtomicLong liveSucceededFetchCount = new AtomicLong(0);
205 
206     transient protected AtomicLong liveFailedFetchCount = new AtomicLong(0);
207 
208     /*** URIs that are disregarded (for example because of robots.txt rules) */
209     transient protected AtomicLong liveDisregardedUriCount = new AtomicLong(0);
210 
211     /***
212      * Used when bandwidth constraints are in effect.
213      */
214     protected long totalProcessedBytes = 0;
215 
216     private transient long nextURIEmitTime = 0;
217 
218     protected long processedBytesAfterLastEmittedURI = 0;
219     
220     protected int lastMaxBandwidthKB = 0;
221 
222     /***
223      * Crawl replay logger.
224      * 
225      * Currently captures Frontier/URI transitions.
226      * Can be null if user chose not to run a recovery.log.
227      */
228     private transient FrontierJournal recover = null;
229 
230     /*** file collecting report of ignored seed-file entries (if any) */
231     public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";
232 
233     /***
234      * @param name Name of this frontier.
235      * @param description Description for this frontier.
236      */
237     public AbstractFrontier(String name, String description) {
238         super(name, description);
239         addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
240                 "How many multiples of last fetch elapsed time to wait before "
241                         + "recontacting same server", DEFAULT_DELAY_FACTOR));
242         addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
243                 "Never wait more than this long.", DEFAULT_MAX_DELAY));
244         addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
245                 "Always wait this long after one completion before recontacting "
246                         + "same server.", DEFAULT_MIN_DELAY));
247         addElementToDefinition(new SimpleType(ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS,
248                 "Respect a Crawl-Delay directive in a site's robots.txt "
249                 +"up to this value in seconds. (If longer, simply "
250                 +"respect this value.) Default is 300 seconds (5 minutes).", 
251                 DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS));
252         addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
253                 "How many times to retry fetching a URI that failed to be retrieved. "
254                         + "If zero, the crawler will get the robots.txt only.",
255                 DEFAULT_MAX_RETRIES));
256         addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
257                 "How long to wait by default until we retry fetching a"
258                         + " URI that failed to be retrieved (seconds). ",
259                 DEFAULT_RETRY_DELAY));
260         addElementToDefinition(new SimpleType(
261                 ATTR_PREFERENCE_EMBED_HOPS,
262                 "Number of embedded (or redirected) hops up to which "
263                 + "a URI has higher priority scheduling. For example, if set "
264                 + "to 1 (the default), items such as inline images (1-hop "
265                 + "embedded resources) will be scheduled ahead of all regular "
266                 + "links (or many-hop resources, like nested frames). If set to "
267                 + "zero, no preferencing will occur, and embeds/redirects are "
268                 + "scheduled the same as regular links.",
269                 DEFAULT_PREFERENCE_EMBED_HOPS));
270         Type t;
271         t = addElementToDefinition(new SimpleType(
272                 ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
273                 "The maximum average bandwidth the crawler is allowed to use. "
274                 + "The actual read speed is not affected by this setting, it only "
275                 + "holds back new URIs from being processed when the bandwidth "
276                 + "usage has been too high. 0 means no bandwidth limitation.",
277                 DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
278         t.setOverrideable(false);
279         t = addElementToDefinition(new SimpleType(
280                 ATTR_MAX_HOST_BANDWIDTH_USAGE,
281                 "The maximum average bandwidth the crawler is allowed to use per "
282                 + "host. The actual read speed is not affected by this setting, "
283                 + "it only holds back new URIs from being processed when the "
284                 + "bandwidth usage has been too high. 0 means no bandwidth "
285                 + "limitation.", DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
286         t.setExpertSetting(true);
287 
288         // Read the list of permissible choices from heritrix.properties.
289         // It's a list of space- or comma-separated values.
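        // For example (org.example.MyPolicy being a hypothetical custom
        // policy class), the list could be overridden with:
        //   -Dorg.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy=
        //       "org.archive.crawler.frontier.HostnameQueueAssignmentPolicy org.example.MyPolicy"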
290         String queueStr = System.getProperty(AbstractFrontier.class.getName() +
291                 "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
292                 HostnameQueueAssignmentPolicy.class.getName() + " " +
293                 IPQueueAssignmentPolicy.class.getName() + " " +
294                 BucketQueueAssignmentPolicy.class.getName() + " " +
295                 SurtAuthorityQueueAssignmentPolicy.class.getName() + " " +
296                 TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
297         Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
298         String [] queues = p.split(queueStr);
299         if (queues.length <= 0) {
300             throw new RuntimeException("Failed parse of " +
301                     "assignment queue policy string: " + queueStr);
302         }
303         t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
304                 "Defines how to assign URIs to queues. Can assign by host, " +
305                 "by IP, or into one of a fixed set of buckets (1k).",
306                 queues[0], queues));
307         t.setExpertSetting(true);
308         t.setOverrideable(true);
309 
310         t = addElementToDefinition(new SimpleType(
311                 ATTR_FORCE_QUEUE,
312                 "The queue name into which to force URIs. Should "
313                 + "be left blank at global level.  Specify a "
314                 + "per-domain/per-host override to force URIs into "
315                 + "a particular named queue, regardless of the assignment "
316                 + "policy in effect (domain or ip-based politeness). "
317                 + "This could be used on domains known to all be from "
318                 + "the same small set of IPs (eg blogspot, dailykos, etc.) "
319                 + "to simulate IP-based politeness, or it could be used if "
320                 + "you wanted to enforce politeness over a whole domain, even "
321                 + "though the subdomains are split across many IPs.",
322                 DEFAULT_FORCE_QUEUE));
323         t.setOverrideable(true);
324         t.setExpertSetting(true);
325         t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
326                 Level.WARNING, "This field must contain only alphanumeric "
327                 + "characters plus period, dash, comma, colon, or underscore."));
328         t = addElementToDefinition(new SimpleType(
329                 ATTR_PAUSE_AT_START,
330                 "Whether to pause when the crawl begins, before any URIs " +
331                 "are tried. This gives the operator a chance to verify or " +
332                 "adjust the crawl before actual work begins. " +
333                 "Default is false.", DEFAULT_PAUSE_AT_START));
334         t = addElementToDefinition(new SimpleType(
335                 ATTR_PAUSE_AT_FINISH,
336                 "Whether to pause when the crawl appears finished, rather "
337                 + "than immediately end the crawl. This gives the operator an "
338                 + "opportunity to view crawl results, and possibly add URIs or "
339                 + "adjust settings, while the crawl state is still available. "
340                 + "Default is false.", DEFAULT_PAUSE_AT_FINISH));
341         t.setOverrideable(false);
342         
343         t = addElementToDefinition(new SimpleType(
344                 ATTR_SOURCE_TAG_SEEDS,
345                 "Whether to tag seeds with their own URI as a heritable " +
346                 "'source' String, which will be carried-forward to all URIs " +
347                 "discovered on paths originating from that seed. When " +
348                 "present, such source tags appear in the second-to-last " +
349                 "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS));
350         t.setOverrideable(false);
351         
352         t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
353                 "Set to false to disable recovery log writing.  Do this if " +
354                 "you are using the checkpoint feature for recovering " +
355                 "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED));
356         t.setExpertSetting(true);
357         // No sense in it being overrideable.
358         t.setOverrideable(false);
359     }
360 
361     public void start() {
362         if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_START))
363                 .booleanValue()) {
364             // trigger crawl-wide pause
365             controller.requestCrawlPause();
366         } else {
367             // simply begin
368             unpause(); 
369         }
370     }
371     
372     synchronized public void pause() {
373         shouldPause = true;
374     }
375 
376     synchronized public void unpause() {
377         shouldPause = false;
378         notifyAll();
379     }
380 
381     public void initialize(CrawlController c)
382             throws FatalConfigurationException, IOException {
383         c.addCrawlStatusListener(this);
384         File logsDisk = null;
385         try {
386             logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
387         } catch (AttributeNotFoundException e) {
388             logger.log(Level.SEVERE, "Failed to get logs directory", e);
389         }
390         if (logsDisk != null) {
391             String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
392             if (((Boolean)getUncheckedAttribute(null, ATTR_RECOVERY_ENABLED))
393                     .booleanValue()) {
394                 this.recover = new RecoveryJournal(logsPath,
395                     FrontierJournal.LOGNAME_RECOVER);
396             }
397         }
398 //        try {
399 //            final Class qapClass = Class.forName((String)getUncheckedAttribute(
400 //                    null, ATTR_QUEUE_ASSIGNMENT_POLICY));
401 //
402 //            queueAssignmentPolicy =
403 //                (QueueAssignmentPolicy)qapClass.newInstance();
404 //        } catch (Exception e) {
405 //            logger.log(Level.SEVERE, "Bad queue assignment policy class", e);
406 //            throw new FatalConfigurationException(e.getMessage());
407 //        }
408     }
409 
410     synchronized public void terminate() {
411         shouldTerminate = true;
412         if (this.recover != null) {
413             this.recover.close();
414             this.recover = null;
415         }
416         unpause();
417     }
418 
419     /***
420      * Report CrawlURI to each of the three 'substats' accumulators
421      * (group/queue, server, host) for a given stage. 
422      * 
423      * @param curi
424      * @param stage
425      */
426     protected void tally(CrawlURI curi, Stage stage) {
427         // Tally per-server, per-host, per-frontier-class running totals
428         CrawlServer server =
429             controller.getServerCache().getServerFor(curi);
430         if (server != null) {
431             server.getSubstats().tally(curi, stage);
432         }
433         CrawlHost host = 
434             controller.getServerCache().getHostFor(curi);
435         if (host != null) {
436             host.getSubstats().tally(curi, stage);
437         } 
438         FrontierGroup group = 
439             controller.getFrontier().getGroup(curi);
440         group.getSubstats().tally(curi, stage);
441     }
442     
443     protected void doJournalFinishedSuccess(CrawlURI c) {
444         tally(c,CrawlSubstats.Stage.SUCCEEDED);
445         if (this.recover != null) {
446             this.recover.finishedSuccess(c);
447         }
448     }
449 
450     protected void doJournalAdded(CrawlURI c) {
451         tally(c,CrawlSubstats.Stage.SCHEDULED);
452         if (this.recover != null) {
453             this.recover.added(c);
454         }
455     }
456 
457     protected void doJournalRescheduled(CrawlURI c) {
458         tally(c,CrawlSubstats.Stage.RETRIED);
459         if (this.recover != null) {
460             this.recover.rescheduled(c);
461         }
462     }
463 
464     protected void doJournalFinishedFailure(CrawlURI c) {
465         tally(c,CrawlSubstats.Stage.FAILED);
466         if (this.recover != null) {
467             this.recover.finishedFailure(c);
468         }
469     }
470     
471     protected void doJournalDisregarded(CrawlURI c) {
472         tally(c,CrawlSubstats.Stage.DISREGARDED);
473         if (this.recover != null) {
474             this.recover.finishedDisregard(c);
475         }
476     }
477 
478     protected void doJournalEmitted(CrawlURI c) {
479         if (this.recover != null) {
480             this.recover.emitted(c);
481         }
482     }
483 
484     /***
485      * Frontier is empty only if all queues are empty and no URIs are in-process
486      * 
487      * @return True if queues are empty.
488      */
489     public boolean isEmpty() {
490         return liveQueuedUriCount.get() == 0;
491     }
492 
493     /***
494      * Increment the running count of queued URIs. 
495      */
496     protected void incrementQueuedUriCount() {
497         liveQueuedUriCount.incrementAndGet();
498     }
499 
500     /***
501      * Increment the running count of queued URIs by the given amount.
503      * 
504      * @param increment
505      *            amount to increment the queued count
506      */
507     protected void incrementQueuedUriCount(long increment) {
508         liveQueuedUriCount.addAndGet(increment);
509     }
510 
511     /***
512      * Note that a number of queued Uris have been deleted.
513      * 
514      * @param numberOfDeletes
515      */
516     protected void decrementQueuedCount(long numberOfDeletes) {
517         liveQueuedUriCount.addAndGet(-numberOfDeletes);
518     }
519 
520     /***
521      * (non-Javadoc)
522      * 
523      * @see org.archive.crawler.framework.Frontier#queuedUriCount()
524      */
525     public long queuedUriCount() {
526         return liveQueuedUriCount.get();
527     }
528 
529     /***
530      * (non-Javadoc)
531      * 
532      * @see org.archive.crawler.framework.Frontier#finishedUriCount()
533      */
534     public long finishedUriCount() {
535         return liveSucceededFetchCount.get() + liveFailedFetchCount.get() + liveDisregardedUriCount.get();
536     }
537 
538     /***
539      * Increment the running count of successfully fetched URIs. 
540      */
541     protected void incrementSucceededFetchCount() {
542         liveSucceededFetchCount.incrementAndGet();
543     }
544 
545     /***
546      * (non-Javadoc)
547      * 
548      * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
549      */
550     public long succeededFetchCount() {
551         return liveSucceededFetchCount.get();
552     }
553 
554     /***
555      * Increment the running count of failed URIs. 
556      */
557     protected void incrementFailedFetchCount() {
558         liveFailedFetchCount.incrementAndGet();
559     }
560 
561     /***
562      * (non-Javadoc)
563      * 
564      * @see org.archive.crawler.framework.Frontier#failedFetchCount()
565      */
566     public long failedFetchCount() {
567         return liveFailedFetchCount.get();
568     }
569 
570     /***
571      * Increment the running count of disregarded URIs.
573      */
574     protected void incrementDisregardedUriCount() {
575         liveDisregardedUriCount.incrementAndGet();
576     }
577 
578     public long disregardedUriCount() {
579         return liveDisregardedUriCount.get();
580     }
581 
582     /*** @deprecated misnomer; use StatisticsTracking figures instead */
583     public long totalBytesWritten() {
584         return totalProcessedBytes;
585     }
586 
587     /***
588      * Load up the seeds.
589      * 
590      * This method is called at initialization and by the CrawlController
591      * when it wants to force a reload of the configuration.
592      * 
593      * @see org.archive.crawler.framework.CrawlController#kickUpdate()
594      */
595     public void loadSeeds() {
596         Writer ignoredWriter = new StringWriter();
597         logger.info("beginning");
598         // Get the seeds to refresh.
599         Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);
600         int count = 0; 
601         while (iter.hasNext()) {
602             UURI u = (UURI)iter.next();
603             CandidateURI caUri = CandidateURI.createSeedCandidateURI(u);
604             caUri.setSchedulingDirective(CandidateURI.MEDIUM);
605             if (((Boolean)getUncheckedAttribute(null, ATTR_SOURCE_TAG_SEEDS))
606                     .booleanValue()) {
607                 caUri.putString(CoreAttributeConstants.A_SOURCE_TAG,caUri.toString());
608                 caUri.makeHeritable(CoreAttributeConstants.A_SOURCE_TAG);
609             }
610             schedule(caUri);
611             count++;
612             if(count%1000==0) {
613                 logger.info(count+" seeds");
614             }
615         }
616         // save ignored items (if any) where they can be consulted later
617         saveIgnoredItems(ignoredWriter.toString(), controller.getDisk());
618         logger.info("finished");
619     }
620 
621     /***
622      * Dump ignored seed items (if any) to disk; delete file otherwise.
623      * Static to allow non-derived sibling classes (frontiers not yet 
624      * subclassed here) to reuse.
625      * 
626      * @param ignoredItems
627      * @param dir 
628      */
629     public static void saveIgnoredItems(String ignoredItems, File dir) {
630         File ignoredFile = new File(dir, IGNORED_SEEDS_FILENAME);
631         if (ignoredItems != null && ignoredItems.length() > 0) {
632             try {
633                 BufferedWriter bw = new BufferedWriter(new FileWriter(ignoredFile));
634                 bw.write(ignoredItems);
635                 bw.close();
636             } catch (IOException e) {
637                 // TODO make an alert?
638                 e.printStackTrace();
639             }
640         } else {
641             // delete any older file (if any)
642             ignoredFile.delete();
643         }
644     }
645 
646     protected CrawlURI asCrawlUri(CandidateURI caUri) {
647         CrawlURI curi;
648         if (caUri instanceof CrawlURI) {
649             curi = (CrawlURI)caUri;
650         } else {
651             curi = CrawlURI.from(caUri, nextOrdinal.getAndIncrement());
652         }
653         curi.setClassKey(getClassKey(curi));
654         return curi;
655     }
656 
657     /***
658      * @param now
659      * @throws InterruptedException
660      * @throws EndedException
661      */
662     protected synchronized void preNext(long now) throws InterruptedException,
663             EndedException {
664         if (this.controller == null) {
665             return;
666         }
667         
668         // Check completion conditions
669         if (this.controller.atFinish()) {
670             if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_FINISH))
671                     .booleanValue()) {
672                 this.controller.requestCrawlPause();
673             } else {
674                 this.controller.beginCrawlStop();
675             }
676         }
677 
678         // enforce operator pause
679         if (shouldPause) {
680             while (shouldPause) {
681                 this.controller.toePaused();
682                 wait();
683             }
684             // exited pause; possibly finish regardless of pause-at-finish
685             if (controller != null && controller.atFinish()) {
686                 this.controller.beginCrawlStop();
687             }
688         }
689 
690         // enforce operator terminate or thread retirement
691         if (shouldTerminate
692                 || ((ToeThread)Thread.currentThread()).shouldRetire()) {
693             throw new EndedException("terminated");
694         }
695 
696         enforceBandwidthThrottle(now);
697     }
698 
699     /***
700      * Perform any special handling of the CrawlURI, such as promoting its URI
701      * to seed-status, or preferencing it because it is an embed.
702      * 
703      * @param curi
704      */
705     protected void applySpecialHandling(CrawlURI curi) {
706         if (curi.isSeed() && curi.getVia() != null
707                 && curi.flattenVia().length() > 0) {
708             // The only way a seed can have a non-empty via is if it is the
709             // result of a seed redirect. Add it to the seeds list.
710             //
711             // This is a feature: it handles the case where a seed
712             // gets immediately redirected to another page. What we're doing is
713             // treating the immediate redirect target as a seed.
714             this.controller.getScope().addSeed(curi);
715             // And it needs rapid scheduling.
716             if (curi.getSchedulingDirective() == CandidateURI.NORMAL)
717                 curi.setSchedulingDirective(CandidateURI.MEDIUM);
718         }
719 
720         // optionally preferencing embeds up to MEDIUM
721         int prefHops = ((Integer)getUncheckedAttribute(curi,
722                 ATTR_PREFERENCE_EMBED_HOPS)).intValue();
723         if (prefHops > 0) {
724             int embedHops = curi.getTransHops();
725             if (embedHops > 0 && embedHops <= prefHops
726                     && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
727                 // number of embed hops falls within the preferenced range, and
728                 // uri is not already MEDIUM -- so promote it
729                 curi.setSchedulingDirective(CandidateURI.MEDIUM);
730             }
731         }
732     }
733 
734     /***
735      * Perform fixups on a CrawlURI about to be returned via next().
736      * 
737      * @param curi
738      *            CrawlURI about to be returned by next()
739      * @param q
740      *            the queue from which the CrawlURI came
741      */
742     protected void noteAboutToEmit(CrawlURI curi, WorkQueue q) {
743         curi.setHolder(q);
744         // if (curi.getServer() == null) {
745         //    // TODO: perhaps short-circuit the emit here,
746         //    // because URI will be rejected as unfetchable
747         // }
748         doJournalEmitted(curi);
749     }
750 
751     /***
752      * @param curi
753      * @return the CrawlServer to be associated with this CrawlURI
754      */
755     protected CrawlServer getServer(CrawlURI curi) {
756         return this.controller.getServerCache().getServerFor(curi);
757     }
758 
759     /***
760      * Return a suitable value to wait before retrying the given URI.
761      * 
762      * @param curi
763      *            CrawlURI to be retried
764      * @return millisecond delay before retry
765      */
766     protected long retryDelayFor(CrawlURI curi) {
767         int status = curi.getFetchStatus();
768         return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST ||
769                 status == S_DOMAIN_UNRESOLVABLE)?
770             ((Long)getUncheckedAttribute(curi, ATTR_RETRY_DELAY)).longValue():
771             0; // no delay for most
772     }
773 
774     /***
775      * Update any scheduling structures with the new information in this
776      * CrawlURI. Chiefly means make necessary arrangements for no other URIs at
777      * the same host to be visited within the appropriate politeness window.
778      * 
779      * @param curi
780      *            The CrawlURI
781      * @return millisecond politeness delay
782      */
783     protected long politenessDelayFor(CrawlURI curi) {
784         long durationToWait = 0;
785         if (curi.containsKey(A_FETCH_BEGAN_TIME)
786                 && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
787 
788             long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
789             long durationTaken = (completeTime - curi
790                     .getLong(A_FETCH_BEGAN_TIME));
791             durationToWait = (long)(((Float)getUncheckedAttribute(curi,
792                     ATTR_DELAY_FACTOR)).floatValue() * durationTaken);
793 
794             long minDelay = ((Integer)getUncheckedAttribute(curi,
795                     ATTR_MIN_DELAY)).longValue();
796             
797             if (minDelay > durationToWait) {
798                 // wait at least the minimum
799                 durationToWait = minDelay;
800             }
801 
802             long maxDelay = ((Integer)getUncheckedAttribute(curi,
803                     ATTR_MAX_DELAY)).longValue();
804             if (durationToWait > maxDelay) {
805                 // wait no more than the maximum
806                 durationToWait = maxDelay;
807             }
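            // Worked example, assuming the default settings above: a fetch
            // that took 400ms with delay-factor 5 yields 2000ms, which
            // min-delay-ms (3000) raises to 3000ms and max-delay-ms (30000)
            // leaves untouched.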
808 
809             long respectThreshold = ((Integer)getUncheckedAttribute(curi,
810                     ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS)).longValue()*1000;
811             
812             if(durationToWait<respectThreshold) {
813                 // may need to extend wait
814                 CrawlServer s = controller.getServerCache().getServerFor(curi);
815                 String ua = curi.getUserAgent();
816                 if(ua==null) {
817                     ua = controller.getOrder().getUserAgent(curi);
818                 }
819                 RobotsExclusionPolicy rep = s.getRobots(); 
820                 if (rep!=null) {
821                     long crawlDelay = (long)(1000 * s.getRobots().getCrawlDelay(ua));
822                     crawlDelay = 
823                         (crawlDelay > respectThreshold) 
824                             ? respectThreshold 
825                             : crawlDelay; 
826                     if (crawlDelay > durationToWait) {
827                         // wait at least the directive crawl-delay
828                         durationToWait = crawlDelay;
829                     }
830                 }
831             }
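            // Continuing the example: a robots.txt "Crawl-Delay: 10" would
            // extend a 3000ms wait to 10000ms, while any declared delay beyond
            // respect-crawl-delay-up-to-secs (default 300s) is capped at that
            // threshold.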
832             
833             long now = System.currentTimeMillis();
834             int maxBandwidthKB = ((Integer)getUncheckedAttribute(curi,
835                     ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
836             if (maxBandwidthKB > 0) {
837                 // Enforce bandwidth limit
838                 CrawlHost host = controller.getServerCache().getHostFor(curi);
839                 long minDurationToWait = host.getEarliestNextURIEmitTime()
840                         - now;
841                 float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
842                 long processedBytes = curi.getContentSize();
843                 host.setEarliestNextURIEmitTime(
844                         (long)(processedBytes / maxBandwidth) + now);
846 
847                 if (minDurationToWait > durationToWait) {
848                     durationToWait = minDurationToWait;
849                 }
850             }
851         }
852         return durationToWait;
853     }
854 
855     /***
856      * Ensure that any overall-bandwidth-usage limit is respected, by pausing as
857      * long as necessary.
858      * 
859      * @param now
860      * @throws InterruptedException
861      */
862     private void enforceBandwidthThrottle(long now) throws InterruptedException {
863         int maxBandwidthKB = ((Integer)getUncheckedAttribute(null,
864                 ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
865         if (maxBandwidthKB > 0) {
866             // Make sure that new bandwidth setting doesn't affect total crawl
867             if (maxBandwidthKB != lastMaxBandwidthKB) {
868                 lastMaxBandwidthKB = maxBandwidthKB;
869                 processedBytesAfterLastEmittedURI = totalProcessedBytes;
870             }
871 
872             // Enforce bandwidth limit
873             long sleepTime = nextURIEmitTime - now;
874             float maxBandwidth = maxBandwidthKB * 1.024F; // Kilo_factor
875             long processedBytes = totalProcessedBytes
876                     - processedBytesAfterLastEmittedURI;
877             long shouldHaveEmittedDiff = nextURIEmitTime == 0? 0
878                     : nextURIEmitTime - now;
879             nextURIEmitTime = (long)(processedBytes / maxBandwidth) + now
880                     + shouldHaveEmittedDiff;
881             processedBytesAfterLastEmittedURI = totalProcessedBytes;
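            // Rough example: a limit of 100 KB/sec equates to ~102.4 bytes/ms
            // (the 1.024 kilo factor), so 51,200 bytes processed since the
            // last emit pushes nextURIEmitTime about 500ms into the future.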
882             if (sleepTime > 0) {
883                 long targetTime = now + sleepTime;
884                 now = System.currentTimeMillis();
885                 while (now < targetTime) {
886                     synchronized (this) {
887                         if (logger.isLoggable(Level.FINE)) {
888                             logger.fine("Frontier waits for: " + sleepTime
889                                     + "ms to respect bandwidth limit.");
890                         }
891                         // TODO: now that this is a wait(), frontier can
892                         // still schedule and finish items while waiting,
893                         // which is good, but multiple threads could all
894                         // wait for the same wakeTime, which somewhat
895                         // spoils the throttle... should be fixed.
896                         wait(targetTime - now);
897                     }
898                     now = System.currentTimeMillis();
899                 }
900             }
901         }
902     }
903 
904     /***
905      * Take note of any processor-local errors that have been entered into the
906      * CrawlURI.
907      * 
908      * @param curi
909      *  
910      */
911     protected void logLocalizedErrors(CrawlURI curi) {
912         if (curi.containsKey(A_LOCALIZED_ERRORS)) {
913             List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
914             Iterator iter = localErrors.iterator();
915             while (iter.hasNext()) {
916                 Object array[] = {curi, iter.next()};
917                 controller.localErrors.log(Level.WARNING, curi.getUURI()
918                         .toString(), array);
919             }
920             // once logged, discard
921             curi.remove(A_LOCALIZED_ERRORS);
922         }
923     }
924 
925     /***
926      * Utility method to return a scratch dir for the given key's temp files.
927      * Every key gets its own subdir. To avoid having any one directory with
928      * thousands of files, there are also two levels of enclosing directory
929      * named by the least-significant hex digits of the key string's java
930      * hashcode.
931      * 
932      * @param key
933      * @return File representing scratch directory
934      */
935     protected File scratchDirFor(String key) {
936         String hex = Integer.toHexString(key.hashCode());
937         while (hex.length() < 4) {
938             hex = "0" + hex;
939         }
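        // For example, a key whose zero-padded hex hashcode ends in "3fa7"
        // lands in <state-dir>/a7/3f/<key>, keeping any one directory small.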
940         int len = hex.length();
941         return new File(this.controller.getStateDisk(), hex.substring(len - 2,
942                 len)
943                 + File.separator
944                 + hex.substring(len - 4, len - 2)
945                 + File.separator + key);
946     }
947 
948     protected boolean overMaxRetries(CrawlURI curi) {
949         // never retry more than the max number of times
950         if (curi.getFetchAttempts() >= ((Integer)getUncheckedAttribute(curi,
951                 ATTR_MAX_RETRIES)).intValue()) {
952             return true;
953         }
954         return false;
955     }
956 
957     public void importRecoverLog(String pathToLog, boolean retainFailures)
958             throws IOException {
959         File source = new File(pathToLog);
960         if (!source.isAbsolute()) {
961             source = new File(getSettingsHandler().getOrder().getController()
962                     .getDisk(), pathToLog);
963         }
964         RecoveryJournal.importRecoverLog(source, controller, retainFailures);
965     }
966 
967     /*
968      * (non-Javadoc)
969      * 
970      * @see org.archive.crawler.framework.URIFrontier#kickUpdate()
971      */
972     public void kickUpdate() {
973         // by default, do nothing
974         // (scope will loadSeeds, if appropriate)
975     }
976 
977     /***
978      * Log to the main crawl.log
979      * 
980      * @param curi
981      */
982     protected void log(CrawlURI curi) {
983         curi.aboutToLog();
984         Object array[] = {curi};
985         this.controller.uriProcessing.log(Level.INFO,
986                 curi.getUURI().toString(), array);
987     }
988 
989     protected boolean isDisregarded(CrawlURI curi) {
990         switch (curi.getFetchStatus()) {
991         case S_ROBOTS_PRECLUDED: // they don't want us to have it
992         case S_BLOCKED_BY_CUSTOM_PROCESSOR:
993         case S_OUT_OF_SCOPE: // filtered out by scope
994         case S_BLOCKED_BY_USER: // filtered out by user
995         case S_TOO_MANY_EMBED_HOPS: // too far from last true link
996         case S_TOO_MANY_LINK_HOPS: // too far from seeds
997         case S_DELETED_BY_USER: // user deleted
998             return true;
999         default:
1000             return false;
1001         }
1002     }
1003 
1004     /***
1005      * Checks if a recently completed CrawlURI that did not finish successfully
1006      * needs to be retried (processed again after some time elapses)
1007      * 
1008      * @param curi
1009      *            The CrawlURI to check
1010      * @return True if we need to retry.
1011      */
1012     protected boolean needsRetrying(CrawlURI curi) {
1013         if (overMaxRetries(curi)) {
1014             return false;
1015         }
1016 
1017         switch (curi.getFetchStatus()) {
1018         case HttpStatus.SC_UNAUTHORIZED:
1019            // We can get here even though a positive status code usually
1020            // means success: if rfc2617 credential data was loaded, we are
1021            // expected to go around again. Check whether an rfc2617
1022            // credential is present; if so, assume FetchHTTP loaded it in
1023            // the expectation of a retry. If none is loaded, we should not
1024            // be here.
1025             boolean loaded = curi.hasRfc2617CredentialAvatar();
1026             if (!loaded && logger.isLoggable(Level.INFO)) {
1027                 logger.info("Have 401 but no creds loaded " + curi);
1028             }
1029             return loaded;
1030         case S_DEFERRED:
1031         case S_CONNECT_FAILED:
1032         case S_CONNECT_LOST:
1033         case S_DOMAIN_UNRESOLVABLE:
1034             // these are all worth a retry
1035             // TODO: consider if any others (S_TIMEOUT in some cases?) deserve
1036             // retry
1037             return true;
1038         default:
1039             return false;
1040         }
1041     }
1042 
1043     /***
1044      * Canonicalize the passed uuri. It would be cleaner if this canonicalize
1045      * function were encapsulated by that which it canonicalizes, but because
1046      * settings change with context -- i.e. there may be overrides in operation
1047      * for a particular URI -- it's not so easy; each CandidateURI would need a
1048      * reference to the settings system, which is awkward to pass in.
1049      * 
1050      * @param uuri Candidate URI to canonicalize.
1051      * @return Canonicalized version of passed <code>uuri</code>.
1052      */
1053     protected String canonicalize(UURI uuri) {
1054         return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
1055     }
1056 
1057     /***
1058      * Canonicalize passed CandidateURI. This method differs from
1059      * {@link #canonicalize(UURI)} in that it takes a look at
1060      * the CandidateURI context possibly overriding any canonicalization effect if
1061      * it could make us miss content. If canonicalization produces a URL that
1062      * was 'alreadyseen', but the entry in the 'alreadyseen' database did
1063      * nothing but redirect to the current URL, we won't get the current URL;
1064      * we'll think we've already seen it. Examples would be archive.org
1065      * redirecting to www.archive.org or the inverse, www.netarkivet.net
1066      * redirecting to netarkivet.net (assuming stripWWW rule enabled).
1067      * <p>Note, this method may under some circumstances set the forceFetch flag.
1068      * 
1069      * @param cauri CandidateURI to examine.
1070      * @return Canonicalized <code>cauri</code>.
1071      */
1072     protected String canonicalize(CandidateURI cauri) {
1073         String canon = canonicalize(cauri.getUURI());
1074         if (cauri.isLocation()) {
1075             // If the via is not the same as where we're being redirected (i.e.
1076             // we're not being redirected back to the same page), AND the
1077             // canonicalization of the via is equal to that of the current cauri,
1078             // THEN forcefetch (so there is no chance of missing content because
1079             // the alreadyseen check thinks it has seen the URL before).
1080             // An example of an URL that redirects to itself is:
1081             // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
1082             // An example of an URL whose canonicalization equals its via's
1083             // canonicalization, and we want to fetch content at the
1084             // redirection (i.e. need to set forcefetch), is netarkivet.dk.
1085             if (!cauri.toString().equals(cauri.getVia().toString()) &&
1086                     canonicalize(cauri.getVia()).equals(canon)) {
1087                 cauri.setForceFetch(true);
1088             }
1089         }
1090         return canon;
1091     }
1092 
1093     /***
1094      * @param cauri CrawlURI we're to get a key for.
1095      * @return a String token representing a queue
1096      */
1097     public String getClassKey(CandidateURI cauri) {
1098         String queueKey = (String)getUncheckedAttribute(cauri,
1099             ATTR_FORCE_QUEUE);
1100         if ("".equals(queueKey)) {
1101             // no forced override
1102             QueueAssignmentPolicy queueAssignmentPolicy = 
1103                 getQueueAssignmentPolicy(cauri);
1104             queueKey =
1105                 queueAssignmentPolicy.getClassKey(this.controller, cauri);
1106         }
1107         return queueKey;
1108     }
1109 
1110     protected QueueAssignmentPolicy getQueueAssignmentPolicy(CandidateURI cauri) {
1111         String clsName = (String)getUncheckedAttribute(cauri,
1112                 ATTR_QUEUE_ASSIGNMENT_POLICY);
1113         try {
1114             return (QueueAssignmentPolicy) Class.forName(clsName).newInstance();
1115         } catch (Exception e) {
1116             throw new RuntimeException(e);
1117         }
1118     }
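    // Note: the configured policy class is looked up and instantiated
    // reflectively on every call above; policy implementations are presumed
    // to be cheap, stateless objects.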
1119 
1120     /***
1121      * @return RecoveryJournal instance.  May be null.
1122      */
1123     public FrontierJournal getFrontierJournal() {
1124         return this.recover;
1125     }
1126 
1127     public void crawlEnding(String sExitMessage) {
1128         // TODO Auto-generated method stub
1129     }
1130 
1131     public void crawlEnded(String sExitMessage) {
1132         if (logger.isLoggable(Level.INFO)) {
1133             logger.info("Closing with " + Long.toString(queuedUriCount()) +
1134                 " urls still in queue.");
1135         }
1136     }
1137 
1138     public void crawlStarted(String message) {
1139         // TODO Auto-generated method stub
1140     }
1141 
1142     public void crawlPausing(String statusMessage) {
1143         // TODO Auto-generated method stub
1144     }
1145 
1146     public void crawlPaused(String statusMessage) {
1147         // TODO Auto-generated method stub
1148     }
1149 
1150     public void crawlResuming(String statusMessage) {
1151         // TODO Auto-generated method stub
1152     }
1153     
1154     public void crawlCheckpoint(File checkpointDir)
1155     throws Exception {
1156         if (this.recover == null) {
1157             return;
1158         }
1159         this.recover.checkpoint(checkpointDir);
1160     }
1161     
1162     //
1163     // Reporter implementation
1164     // 
1165     public String singleLineReport() {
1166         return ArchiveUtils.singleLineReport(this);
1167     }
1168 
1169     public void reportTo(PrintWriter writer) {
1170         reportTo(null, writer);
1171     }
1172     
1173     //
1174     // maintain serialization compatibility with the pre-AtomicLong impl
1175     private void writeObject(java.io.ObjectOutputStream out)
1176     throws IOException {
1177         queuedUriCount = liveQueuedUriCount.get();
1178         succeededFetchCount = liveSucceededFetchCount.get();
1179         failedFetchCount = liveFailedFetchCount.get();
1180         disregardedUriCount = liveDisregardedUriCount.get();
1181         out.defaultWriteObject();
1182     }
1183     private void readObject(java.io.ObjectInputStream in)
1184     throws IOException, ClassNotFoundException {
1185         in.defaultReadObject();
1186         liveQueuedUriCount = new AtomicLong(queuedUriCount);
1187         liveSucceededFetchCount = new AtomicLong(succeededFetchCount);
1188         liveFailedFetchCount = new AtomicLong(failedFetchCount);
1189         liveDisregardedUriCount = new AtomicLong(disregardedUriCount);
1190     }
1191 }