package org.archive.crawler.frontier;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlSubstats;
import org.archive.crawler.datamodel.CrawlSubstats.Stage;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.RobotsExclusionPolicy;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
/**
 * Shared facilities for Frontier implementations.
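 * Provides common configuration attributes, URI counters, recovery-journal
 * hooks, and politeness/bandwidth delay calculations for concrete Frontier
 * subclasses.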
 *
 * @author gojomo
 */
public abstract class AbstractFrontier extends ModuleType
        implements CrawlStatusListener, Frontier, FetchStatusCodes,
        CoreAttributeConstants, Serializable {
    private static final long serialVersionUID = -4766504935003203930L;

    private static final Logger logger = Logger
        .getLogger(AbstractFrontier.class.getName());

    protected transient CrawlController controller;

    /** Ordinal numbers to assign to created CrawlURIs. */
    protected AtomicLong nextOrdinal = new AtomicLong(1);

    /** Should the frontier hold any threads asking for URIs? */
    protected boolean shouldPause = false;

    /**
     * Should the frontier send an EndedException to any threads asking for
     * URIs?
     */
    protected transient boolean shouldTerminate = false;

    /**
     * How many multiples of last fetch elapsed time to wait before
     * recontacting same server.
     */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";

    protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /**
     * Always wait this long after one completion before recontacting same
     * server, regardless of multiple.
     */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);

    /**
     * Whether to respect a 'Crawl-Delay' (in seconds) given in a site's
     * robots.txt.
     */
    public final static String
        ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS = "respect-crawl-delay-up-to-secs";

    protected final static Integer
        DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS = new Integer(300);

    /** Never wait more than this long, regardless of multiple. */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** Number of hops of embeds (ERX) to bump to front of host queue. */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";

    protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(1);

    /** Maximum per-host bandwidth usage. */
    public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
        "max-per-host-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
        new Integer(0);

    /** Maximum overall bandwidth usage. */
    public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
        "total-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
        new Integer(0);

    /** For retryable problems, seconds to wait before a retry. */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    protected final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** Maximum times to emit a CrawlURI without final disposition. */
    public final static String ATTR_MAX_RETRIES = "max-retries";

    protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";

    /** Queue assignment to force onto CrawlURIs; intended to be overridden. */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";

    protected final static String DEFAULT_FORCE_QUEUE = "";

    /** Acceptable characters in a forced queue name: word characters plus
     * dash, period, comma, colon. */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** Whether to pause, rather than finish, when crawl appears done. */
    public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";

    protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;

    /** Whether to pause at crawl start. */
    public final static String ATTR_PAUSE_AT_START = "pause-at-start";

    protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;

    /** Whether to tag seeds with their own URI as a heritable 'source'. */
    public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";

    protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;

    /**
     * Recover log on or off attribute.
     */
    protected final static String ATTR_RECOVERY_ENABLED =
        "recovery-log-enabled";

    protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
        Boolean.TRUE;

    // Snapshots of the live counters, written/read during (de)serialization.
    protected long queuedUriCount;
    protected long succeededFetchCount;
    protected long failedFetchCount;
    protected long disregardedUriCount;

    /** Total URIs queued to be visited. */
    transient protected AtomicLong liveQueuedUriCount = new AtomicLong(0);

    transient protected AtomicLong liveSucceededFetchCount = new AtomicLong(0);

    transient protected AtomicLong liveFailedFetchCount = new AtomicLong(0);

    /** URIs that are disregarded (for example because of robots.txt rules). */
    transient protected AtomicLong liveDisregardedUriCount = new AtomicLong(0);

    /**
     * Used when bandwidth constraints are in effect.
     */
    protected long totalProcessedBytes = 0;

    private transient long nextURIEmitTime = 0;

    protected long processedBytesAfterLastEmittedURI = 0;

    protected int lastMaxBandwidthKB = 0;

    /**
     * Crawl replay logger.
     *
     * Currently captures Frontier/URI transitions.
     * May be null if the user chose not to keep a recovery log.
     */
    private transient FrontierJournal recover = null;

    /** File collecting report of ignored seed-file entries (if any). */
    public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";

    /**
     * @param name Name of this frontier.
     * @param description Description for this frontier.
     */
    public AbstractFrontier(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before "
                + "recontacting same server.", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long.", DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting "
                + "same server.", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(
            ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS,
            "Respect a Crawl-Delay directive in a site's robots.txt "
                + "up to this value in seconds. (If longer, simply "
                + "respect this value.) Default is 300 seconds (5 minutes).",
            DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
            "How many times to retry fetching a URI that failed to be "
                + "retrieved. If zero, the crawler will get the robots.txt "
                + "only.", DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
            "How long to wait by default until we retry fetching a"
                + " URI that failed to be retrieved (seconds).",
            DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(
            ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which "
                + "a URI has higher priority scheduling. For example, if set "
                + "to 1 (the default), items such as inline images (1-hop "
                + "embedded resources) will be scheduled ahead of all regular "
                + "links (or many-hop resources, like nested frames). If set "
                + "to zero, no preferencing will occur, and embeds/redirects "
                + "are scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(
            ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use. "
                + "The actual read speed is not affected by this setting; it "
                + "only holds back new URIs from being processed when the "
                + "bandwidth usage has been too high. 0 means no bandwidth "
                + "limitation.", DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
            ATTR_MAX_HOST_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use per "
                + "host. The actual read speed is not affected by this "
                + "setting; it only holds back new URIs from being processed "
                + "when the bandwidth usage has been too high. 0 means no "
                + "bandwidth limitation.", DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
        t.setExpertSetting(true);

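        // The default ordering of candidate queue-assignment policies below
        // can be overridden at JVM launch via a system property keyed on this
        // class's name; illustrative example (any QueueAssignmentPolicy
        // implementation on the classpath may be named):
        //   -Dorg.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy=
        //       org.archive.crawler.frontier.IPQueueAssignmentPolicy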
        String queueStr = System.getProperty(AbstractFrontier.class.getName() +
            "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " +
            IPQueueAssignmentPolicy.class.getName() + " " +
            BucketQueueAssignmentPolicy.class.getName() + " " +
            SurtAuthorityQueueAssignmentPolicy.class.getName() + " " +
            TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String[] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of " +
                "assignment queue policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, " +
            "by IP, and into one of a fixed set of buckets (1k).",
            queues[0], queues));
        t.setExpertSetting(true);
        t.setOverrideable(true);

        t = addElementToDefinition(new SimpleType(
            ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should "
                + "be left blank at global level. Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
            Level.WARNING, "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or "
                + "underscore."));
        t = addElementToDefinition(new SimpleType(
            ATTR_PAUSE_AT_START,
            "Whether to pause when the crawl begins, before any URIs " +
            "are tried. This gives the operator a chance to verify or " +
            "adjust the crawl before actual work begins. " +
            "Default is false.", DEFAULT_PAUSE_AT_START));
        t = addElementToDefinition(new SimpleType(
            ATTR_PAUSE_AT_FINISH,
            "Whether to pause when the crawl appears finished, rather "
                + "than immediately end the crawl. This gives the operator an "
                + "opportunity to view crawl results, and possibly add URIs "
                + "or adjust settings, while the crawl state is still "
                + "available. Default is false.", DEFAULT_PAUSE_AT_FINISH));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(
            ATTR_SOURCE_TAG_SEEDS,
            "Whether to tag seeds with their own URI as a heritable " +
            "'source' String, which will be carried-forward to all URIs " +
            "discovered on paths originating from that seed. When " +
            "present, such source tags appear in the second-to-last " +
            "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
            "Set to false to disable recovery log writing. Do this if " +
            "you are using the checkpoint feature for recovering " +
            "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED));
        t.setExpertSetting(true);
        t.setOverrideable(false);
    }

    public void start() {
        if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_START))
                .booleanValue()) {
            controller.requestCrawlPause();
        } else {
            unpause();
        }
    }

    synchronized public void pause() {
        shouldPause = true;
    }

    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

    public void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        c.addCrawlStatusListener(this);
        File logsDisk = null;
        try {
            logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed to get logs directory", e);
        }
        if (logsDisk != null) {
            String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
            if (((Boolean)getUncheckedAttribute(null, ATTR_RECOVERY_ENABLED))
                    .booleanValue()) {
                this.recover = new RecoveryJournal(logsPath,
                    FrontierJournal.LOGNAME_RECOVER);
            }
        }
    }

    synchronized public void terminate() {
        shouldTerminate = true;
        if (this.recover != null) {
            this.recover.close();
            this.recover = null;
        }
        unpause();
    }

    /**
     * Report CrawlURI to each of the three 'substats' accumulators
     * (group/queue, server, host) for a given stage.
     *
     * @param curi
     * @param stage
     */
    protected void tally(CrawlURI curi, Stage stage) {
        CrawlServer server =
            controller.getServerCache().getServerFor(curi);
        if (server != null) {
            server.getSubstats().tally(curi, stage);
        }
        CrawlHost host =
            controller.getServerCache().getHostFor(curi);
        if (host != null) {
            host.getSubstats().tally(curi, stage);
        }
        FrontierGroup group =
            controller.getFrontier().getGroup(curi);
        group.getSubstats().tally(curi, stage);
    }

    protected void doJournalFinishedSuccess(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SUCCEEDED);
        if (this.recover != null) {
            this.recover.finishedSuccess(c);
        }
    }

    protected void doJournalAdded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SCHEDULED);
        if (this.recover != null) {
            this.recover.added(c);
        }
    }

    protected void doJournalRescheduled(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.RETRIED);
        if (this.recover != null) {
            this.recover.rescheduled(c);
        }
    }

    protected void doJournalFinishedFailure(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.FAILED);
        if (this.recover != null) {
            this.recover.finishedFailure(c);
        }
    }

    protected void doJournalDisregarded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.DISREGARDED);
        if (this.recover != null) {
            this.recover.finishedDisregard(c);
        }
    }

    protected void doJournalEmitted(CrawlURI c) {
        if (this.recover != null) {
            this.recover.emitted(c);
        }
    }

    /**
     * Frontier is empty only if all queues are empty and no URIs are
     * in-process.
     *
     * @return True if queues are empty.
     */
    public boolean isEmpty() {
        return liveQueuedUriCount.get() == 0;
    }

    /**
     * Increment the running count of queued URIs.
     */
    protected void incrementQueuedUriCount() {
        liveQueuedUriCount.incrementAndGet();
    }

    /**
     * Increment the running count of queued URIs by an arbitrary amount.
     *
     * @param increment
     *            amount to increment the queued count
     */
    protected void incrementQueuedUriCount(long increment) {
        liveQueuedUriCount.addAndGet(increment);
    }

    /**
     * Note that a number of queued URIs have been deleted.
     *
     * @param numberOfDeletes
     */
    protected void decrementQueuedCount(long numberOfDeletes) {
        liveQueuedUriCount.addAndGet(-numberOfDeletes);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        return liveQueuedUriCount.get();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return liveSucceededFetchCount.get() + liveFailedFetchCount.get()
            + liveDisregardedUriCount.get();
    }

    /**
     * Increment the running count of successfully fetched URIs.
     */
    protected void incrementSucceededFetchCount() {
        liveSucceededFetchCount.incrementAndGet();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        return liveSucceededFetchCount.get();
    }

    /**
     * Increment the running count of failed URIs.
     */
    protected void incrementFailedFetchCount() {
        liveFailedFetchCount.incrementAndGet();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        return liveFailedFetchCount.get();
    }

    /**
     * Increment the running count of disregarded URIs.
     */
    protected void incrementDisregardedUriCount() {
        liveDisregardedUriCount.incrementAndGet();
    }

    public long disregardedUriCount() {
        return liveDisregardedUriCount.get();
    }

    /** @deprecated misnomer; use StatisticsTracking figures instead */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Load up the seeds.
     *
     * This method is called at initialization and from within the
     * CrawlController when it wants to force a reload of configuration.
     *
     * @see org.archive.crawler.framework.CrawlController#kickUpdate()
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        logger.info("beginning");
        // Get the seeds to refresh.
        Iterator iter =
            this.controller.getScope().seedsIterator(ignoredWriter);
        int count = 0;
        while (iter.hasNext()) {
            UURI u = (UURI)iter.next();
            CandidateURI caUri = CandidateURI.createSeedCandidateURI(u);
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            if (((Boolean)getUncheckedAttribute(null, ATTR_SOURCE_TAG_SEEDS))
                    .booleanValue()) {
                caUri.putString(CoreAttributeConstants.A_SOURCE_TAG,
                    caUri.toString());
                caUri.makeHeritable(CoreAttributeConstants.A_SOURCE_TAG);
            }
            schedule(caUri);
            count++;
            if (count % 1000 == 0) {
                logger.info(count + " seeds");
            }
        }
        // Save ignored items (if any) where they can be consulted later.
        saveIgnoredItems(ignoredWriter.toString(), controller.getDisk());
        logger.info("finished");
    }

    /**
     * Dump ignored seed items (if any) to disk; delete the file otherwise.
     * Static to allow non-derived sibling classes (frontiers not yet
     * subclassed here) to reuse.
     *
     * @param ignoredItems
     * @param dir
     */
    public static void saveIgnoredItems(String ignoredItems, File dir) {
        File ignoredFile = new File(dir, IGNORED_SEEDS_FILENAME);
        if (ignoredItems != null && ignoredItems.length() > 0) {
            try {
                BufferedWriter bw =
                    new BufferedWriter(new FileWriter(ignoredFile));
                bw.write(ignoredItems);
                bw.close();
            } catch (IOException e) {
                // Log and continue; losing this report is not fatal.
                e.printStackTrace();
            }
        } else {
            // Nothing ignored; remove any stale report from a previous run.
            ignoredFile.delete();
        }
    }

    protected CrawlURI asCrawlUri(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI)caUri;
        } else {
            curi = CrawlURI.from(caUri, nextOrdinal.getAndIncrement());
        }
        curi.setClassKey(getClassKey(curi));
        return curi;
    }

    /**
     * Perform checks common to the top of a next() call: honor end-of-crawl
     * pause/stop requests, operator pause/terminate state, and the overall
     * bandwidth throttle.
     *
     * @param now current time in milliseconds
     * @throws InterruptedException
     * @throws EndedException
     */
    protected synchronized void preNext(long now) throws InterruptedException,
            EndedException {
        if (this.controller == null) {
            return;
        }

        // Check completion conditions.
        if (this.controller.atFinish()) {
            if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_FINISH))
                    .booleanValue()) {
                this.controller.requestCrawlPause();
            } else {
                this.controller.beginCrawlStop();
            }
        }

        // Enforce operator pause.
        if (shouldPause) {
            while (shouldPause) {
                this.controller.toePaused();
                wait();
            }
            // Exited pause; the crawl may have reached its finish condition
            // in the meantime.
            if (controller != null && controller.atFinish()) {
                this.controller.beginCrawlStop();
            }
        }

        // Enforce operator terminate or thread retirement.
        if (shouldTerminate
                || ((ToeThread)Thread.currentThread()).shouldRetire()) {
            throw new EndedException("terminated");
        }

        enforceBandwidthThrottle(now);
    }

    /**
     * Perform any special handling of the CrawlURI, such as promoting its URI
     * to seed-status, or preferencing it because it is an embed.
     *
     * @param curi
     */
    protected void applySpecialHandling(CrawlURI curi) {
        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
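            // A seed with a non-empty via is the result of a seed redirect;
            // treat the redirect target as a new seed by adding it to scope.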
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            if (curi.getSchedulingDirective() == CandidateURI.NORMAL)
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
        }

        // Optionally preference embedded resources: a URI within
        // 'preference-embed-hops' transitive embed hops gets bumped ahead.
        int prefHops = ((Integer)getUncheckedAttribute(curi,
            ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // Note: only upgrades a normal-priority URI.
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
            }
        }
    }

    /**
     * Perform fixups on a CrawlURI about to be returned via next().
     *
     * @param curi
     *            CrawlURI about to be returned by next()
     * @param q
     *            the queue from which the CrawlURI came
     */
    protected void noteAboutToEmit(CrawlURI curi, WorkQueue q) {
        curi.setHolder(q);
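        // Remember the queue the URI was drawn from, so later processing of
        // the finished URI can find its queue again.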
        doJournalEmitted(curi);
    }

    /**
     * @param curi
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /**
     * Return a suitable value to wait before retrying the given URI.
     *
     * @param curi
     *            CrawlURI to be retried
     * @return millisecond delay before retry
     */
    protected long retryDelayFor(CrawlURI curi) {
        int status = curi.getFetchStatus();
        return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST ||
                status == S_DOMAIN_UNRESOLVABLE)
            ? ((Long)getUncheckedAttribute(curi, ATTR_RETRY_DELAY)).longValue()
            : 0;
    }

    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly means make necessary arrangements for no other URIs
     * at the same host to be visited within the appropriate politeness
     * window.
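     * <p>For example, with the default delay-factor of 5 and min-delay of
     * 3000ms, a fetch that took 400ms yields 5 &times; 400 = 2000ms, which
     * is then raised to the 3000ms minimum.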
     *
     * @param curi
     *            The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {

            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
            long durationTaken = (completeTime - curi
                .getLong(A_FETCH_BEGAN_TIME));
            durationToWait = (long)(((Float)getUncheckedAttribute(curi,
                ATTR_DELAY_FACTOR)).floatValue() * durationTaken);

            long minDelay = ((Integer)getUncheckedAttribute(curi,
                ATTR_MIN_DELAY)).longValue();
            if (minDelay > durationToWait) {
                // Wait at least the minimum.
                durationToWait = minDelay;
            }

            long maxDelay = ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_DELAY)).longValue();
            if (durationToWait > maxDelay) {
                // Wait no more than the maximum.
                durationToWait = maxDelay;
            }

            long respectThreshold = ((Integer)getUncheckedAttribute(curi,
                ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS)).longValue() * 1000;
            if (durationToWait < respectThreshold) {
                // Wait at least the robots.txt Crawl-Delay, clamped to the
                // respect threshold.
                CrawlServer s = controller.getServerCache().getServerFor(curi);
                String ua = curi.getUserAgent();
                if (ua == null) {
                    ua = controller.getOrder().getUserAgent(curi);
                }
                RobotsExclusionPolicy rep = s.getRobots();
                if (rep != null) {
                    long crawlDelay = (long)(1000 * rep.getCrawlDelay(ua));
                    crawlDelay =
                        (crawlDelay > respectThreshold)
                            ? respectThreshold
                            : crawlDelay;
                    if (crawlDelay > durationToWait) {
                        durationToWait = crawlDelay;
                    }
                }
            }

            long now = System.currentTimeMillis();
            int maxBandwidthKB = ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
            if (maxBandwidthKB > 0) {
                // Enforce the per-host bandwidth limit.
                CrawlHost host = controller.getServerCache().getHostFor(curi);
                long minDurationToWait =
                    host.getEarliestNextURIEmitTime() - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // bytes per ms
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                    (long)(processedBytes / maxBandwidth) + now);
                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }

    /**
     * Ensure that any overall-bandwidth-usage limit is respected, by pausing
     * as long as necessary.
     *
     * @param now current time in milliseconds
     * @throws InterruptedException
     */
    private void enforceBandwidthThrottle(long now)
            throws InterruptedException {
        int maxBandwidthKB = ((Integer)getUncheckedAttribute(null,
            ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
        if (maxBandwidthKB > 0) {
            // Reset the accounting baseline whenever the limit changes.
            if (maxBandwidthKB != lastMaxBandwidthKB) {
                lastMaxBandwidthKB = maxBandwidthKB;
                processedBytesAfterLastEmittedURI = totalProcessedBytes;
            }

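            // Pace emissions so average throughput stays within the limit:
            // maxBandwidthKB * 1.024 is bytes-per-millisecond, so
            // processedBytes / maxBandwidth is how long the bytes processed
            // since the last emission should have taken at the limit (e.g.
            // 512000 bytes under a 100 KB/sec cap spaces the next emission
            // about 5000ms out).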
            long sleepTime = nextURIEmitTime - now;
            float maxBandwidth = maxBandwidthKB * 1.024F; // bytes per ms
            long processedBytes = totalProcessedBytes
                - processedBytesAfterLastEmittedURI;
            long shouldHaveEmittedDiff = nextURIEmitTime == 0 ? 0
                : nextURIEmitTime - now;
            nextURIEmitTime = (long)(processedBytes / maxBandwidth) + now
                + shouldHaveEmittedDiff;
            processedBytesAfterLastEmittedURI = totalProcessedBytes;
            if (sleepTime > 0) {
                long targetTime = now + sleepTime;
                now = System.currentTimeMillis();
                while (now < targetTime) {
                    synchronized (this) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Frontier waits for: " + sleepTime
                                + "ms to respect bandwidth limit.");
                        }
                        // Wait until targetTime; an early notify simply
                        // rechecks the clock and waits again.
                        wait(targetTime - now);
                    }
                    now = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Take note of any processor-local errors that have been entered into the
     * CrawlURI.
     *
     * @param curi
     */
    protected void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING, curi.getUURI()
                    .toString(), array);
            }
            // Once logged, these errors are no longer of interest.
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Utility method to return a scratch dir for the given key's temp files.
     * Every key gets its own subdir. To avoid having any one directory with
     * thousands of files, there are also two levels of enclosing directory
     * named by the least-significant hex digits of the key string's java
     * hashcode.
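     * For example, a key whose hashcode is 0x613f8a2c is placed under
     * <i>state-dir</i>/2c/8a/<i>key</i>.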
     *
     * @param key
     * @return File representing scratch directory
     */
    protected File scratchDirFor(String key) {
        String hex = Integer.toHexString(key.hashCode());
        while (hex.length() < 4) {
            hex = "0" + hex;
        }
        int len = hex.length();
        return new File(this.controller.getStateDisk(),
            hex.substring(len - 2, len)
            + File.separator
            + hex.substring(len - 4, len - 2)
            + File.separator + key);
    }

    protected boolean overMaxRetries(CrawlURI curi) {
        // Check whether the maximum number of retries has been reached.
        return curi.getFetchAttempts() >= ((Integer)getUncheckedAttribute(
            curi, ATTR_MAX_RETRIES)).intValue();
    }

    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        File source = new File(pathToLog);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder().getController()
                .getDisk(), pathToLog);
        }
        RecoveryJournal.importRecoverLog(source, controller, retainFailures);
    }

    public void kickUpdate() {
        // Default is to do nothing; subclasses may override to react to
        // settings changes.
    }

    /**
     * Log to the main crawl.log.
     *
     * @param curi
     */
    protected void log(CrawlURI curi) {
        curi.aboutToLog();
        Object array[] = {curi};
        this.controller.uriProcessing.log(Level.INFO,
            curi.getUURI().toString(), array);
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish
     * successfully needs to be retried (processed again after some time
     * elapses).
     *
     * @param curi
     *            The CrawlURI to check
     * @return True if we need to retry.
     */
    protected boolean needsRetrying(CrawlURI curi) {
        if (overMaxRetries(curi)) {
            return false;
        }

        switch (curi.getFetchStatus()) {
        case HttpStatus.SC_UNAUTHORIZED:
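            // A 401 is only worth retrying if rfc2617 (basic/digest auth)
            // credentials have since been loaded for use on the next
            // attempt; otherwise there is nothing new to try.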
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded && logger.isLoggable(Level.INFO)) {
                logger.info("Have 401 but no creds loaded " + curi);
            }
            return loaded;
        case S_DEFERRED:
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // These are all worth a retry.
            return true;
        default:
            return false;
        }
    }

    /**
     * Canonicalize passed uuri. It would be cleaner if this canonicalize
     * function were encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in
     * operation for a particular URI -- it's not so easy; each CandidateURI
     * would need a reference to the settings system, which is awkward to
     * pass in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at
     * the CandidateURI context, possibly overriding any canonicalization
     * effect that could make us miss content. If canonicalization produces a
     * URL that was 'alreadyseen', but the entry in the 'alreadyseen' database
     * did nothing but redirect to the current URL, we won't get the current
     * URL; we'll think we've already seen it. Examples would be archive.org
     * redirecting to www.archive.org or the inverse, www.netarkivet.net
     * redirecting to netarkivet.net (assuming the stripWWW rule is enabled).
     * <p>Note, this method under some circumstances sets the forceFetch flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
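            // This URI is a redirect target. If it is not a self-redirect
            // but its via canonicalizes to the same key as this URI, the
            // 'alreadyseen' database may already hold that key from the
            // redirecting URI alone; force the fetch so the target content
            // is not missed.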
            if (!cauri.toString().equals(cauri.getVia().toString()) &&
                    canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * @param cauri CandidateURI we're to get a class key (queue name) for.
     * @return a String token representing a queue
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String)getUncheckedAttribute(cauri,
            ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
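            // No forced queue name in effect; defer to the configured
            // assignment policy (e.g. the default
            // HostnameQueueAssignmentPolicy keys queues by hostname).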
            QueueAssignmentPolicy queueAssignmentPolicy =
                getQueueAssignmentPolicy(cauri);
            queueKey =
                queueAssignmentPolicy.getClassKey(this.controller, cauri);
        }
        return queueKey;
    }

    protected QueueAssignmentPolicy getQueueAssignmentPolicy(
            CandidateURI cauri) {
        String clsName = (String)getUncheckedAttribute(cauri,
            ATTR_QUEUE_ASSIGNMENT_POLICY);
        try {
            return (QueueAssignmentPolicy)Class.forName(clsName).newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return RecoveryJournal instance. May be null.
     */
    public FrontierJournal getFrontierJournal() {
        return this.recover;
    }

    public void crawlEnding(String sExitMessage) {
        // nothing to do
    }

    public void crawlEnded(String sExitMessage) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Closing with " + Long.toString(queuedUriCount()) +
                " URIs still in queue.");
        }
    }

    public void crawlStarted(String message) {
        // nothing to do
    }

    public void crawlPausing(String statusMessage) {
        // nothing to do
    }

    public void crawlPaused(String statusMessage) {
        // nothing to do
    }

    public void crawlResuming(String statusMessage) {
        // nothing to do
    }

    public void crawlCheckpoint(File checkpointDir)
            throws Exception {
        if (this.recover == null) {
            return;
        }
        this.recover.checkpoint(checkpointDir);
    }

    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }

    /**
     * Capture the live counter values into serializable fields before
     * default serialization.
     */
    private void writeObject(java.io.ObjectOutputStream out)
            throws IOException {
        queuedUriCount = liveQueuedUriCount.get();
        succeededFetchCount = liveSucceededFetchCount.get();
        failedFetchCount = liveFailedFetchCount.get();
        disregardedUriCount = liveDisregardedUriCount.get();
        out.defaultWriteObject();
    }

    /**
     * Restore the live counters from the serialized snapshot fields.
     */
    private void readObject(java.io.ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        liveQueuedUriCount = new AtomicLong(queuedUriCount);
        liveSucceededFetchCount = new AtomicLong(succeededFetchCount);
        liveFailedFetchCount = new AtomicLong(failedFetchCount);
        liveDisregardedUriCount = new AtomicLong(disregardedUriCount);
    }
}