package org.archive.crawler.framework;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EventObject;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.StatisticsTracker;
import org.archive.crawler.datamodel.Checkpoint;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.ServerCache;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.io.LocalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.util.CheckpointUtils;
import org.archive.io.GenerationFileHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.CachedBdbMap;
import org.archive.util.FileUtils;
import org.archive.util.Reporter;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.utilint.DbLsn;

/**
 * CrawlController collects all the classes which cooperate to
 * perform a crawl and provides a high-level interface to the
 * running crawl.
 *
 * As the "global context" for a crawl, the CrawlController is the
 * object through which subcomponents often reach each other.
 *
 * @author Gordon Mohr
 */
public class CrawlController implements Serializable, Reporter {

    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(CrawlController.class, 1);

    /**
     * Messages from the CrawlController.
     *
     * They appear on console.
     */
    private final static Logger LOGGER =
        Logger.getLogger(CrawlController.class.getName());

    /** abbreviation label for config files in manifest */
    public static final char MANIFEST_CONFIG_FILE = 'C';
    /** abbreviation label for report files in manifest */
    public static final char MANIFEST_REPORT_FILE = 'R';
    /** abbreviation label for log files in manifest */
    public static final char MANIFEST_LOG_FILE = 'L';

    private static final String LOGNAME_PROGRESS_STATISTICS =
        "progress-statistics";
    private static final String LOGNAME_URI_ERRORS = "uri-errors";
    private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
    private static final String LOGNAME_LOCAL_ERRORS = "local-errors";
    private static final String LOGNAME_CRAWL = "crawl";

    private transient CrawlOrder order;
    private transient CrawlScope scope;
    private transient ProcessorChainList processorChains;

    private transient Frontier frontier;

    private transient ToePool toePool;

    private transient ServerCache serverCache;

    private transient SettingsHandler settingsHandler;

    private volatile transient boolean singleThreadMode = false;
    private transient ReentrantLock singleThreadLock = null;

    // Emergency reserve of memory; freeReserveMemory() releases blocks so
    // cleanup code has headroom when the JVM runs short.
    private transient LinkedList<char[]> reserveMemory;
    private static final int RESERVE_BLOCKS = 1;
    // 6MB. (Note: the previous expression "6*2^20" was a bug -- '^' is
    // XOR in Java, not exponentiation.)
    private static final int RESERVE_BLOCK_SIZE = 6 * 1024 * 1024;

    /**
     * Crawl exit status.
     */
    private transient String sExit;

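    // Crawl-state tokens. These are compared by identity (==) throughout,
    // so each is kept as a single interned instance.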
    private static final Object NASCENT = "NASCENT".intern();
    private static final Object RUNNING = "RUNNING".intern();
    private static final Object PAUSED = "PAUSED".intern();
    private static final Object PAUSING = "PAUSING".intern();
    private static final Object CHECKPOINTING = "CHECKPOINTING".intern();
    private static final Object STOPPING = "STOPPING".intern();
    private static final Object FINISHED = "FINISHED".intern();
    private static final Object STARTED = "STARTED".intern();
    private static final Object PREPARING = "PREPARING".intern();

    transient private Object state = NASCENT;

    private transient File disk;
    private transient File logsDisk;

    /**
     * For temp files representing state of crawler (e.g. queues).
     */
    private transient File stateDisk;

    /**
     * For discardable temp files (e.g. fetch buffers).
     */
    private transient File scratchDisk;

    /**
     * Directory that holds checkpoints.
     */
    private transient File checkpointsDisk;

    /**
     * Checkpointer.
     * Knows if a checkpoint is in progress and what the name of the
     * checkpoint is. Also runs checkpoints.
     */
    private Checkpointer checkpointer;

    /**
     * Gets set to the checkpoint we're recovering from, if we're in
     * checkpoint-recover mode. Gets set up by {@link #getCheckpointRecover()}.
     */
    private transient Checkpoint checkpointRecover = null;

    private long maxBytes;
    private long maxDocument;
    private long maxTime;

    /**
     * A manifest of all files used/created during this crawl. Written to file
     * at the end of the crawl (the very last thing done).
     */
    private StringBuffer manifest;

    /**
     * Record of FileHandlers established for loggers,
     * assisting file rotation.
     */
    transient private Map<Logger,FileHandler> fileHandlers;

    /** suffix to use on active logs */
    public static final String CURRENT_LOG_SUFFIX = ".log";

    /**
     * Crawl progress logger.
     *
     * No exceptions. Logs the summary result of each URI's processing.
     */
    public transient Logger uriProcessing;

    /**
     * This logger contains unexpected runtime errors.
     *
     * Would contain errors trying to set up a job or failures inside
     * processors that they are not prepared to recover from.
     */
    public transient Logger runtimeErrors;

    /**
     * This logger is for job-scoped logging, specifically errors which
     * happen and are handled within a particular processor.
     *
     * Examples would be socket timeouts, exceptions thrown by extractors, etc.
     */
    public transient Logger localErrors;

    /**
     * Special log for URI format problems, wherever they may occur.
     */
    public transient Logger uriErrors;

    /**
     * Statistics tracker writes here at regular intervals.
     */
    private transient Logger progressStats;

    /**
     * Logger to hold job summary report.
     *
     * Large state reports made at infrequent intervals (e.g. job ending) go
     * here.
     */
    public transient Logger reports;

    protected StatisticsTracking statistics = null;

    /**
     * List of crawl status listeners.
     *
     * All iterations need to synchronize on this object if they're to avoid
     * concurrent modification exceptions.
     * See {@link java.util.Collections#synchronizedList(List)}.
     */
    private transient List<CrawlStatusListener> registeredCrawlStatusListeners =
        Collections.synchronizedList(new ArrayList<CrawlStatusListener>());

    // Direct reference for the common case of a single
    // CrawlURIDispositionListener, so event firing can skip list iteration.
    private transient CrawlURIDispositionListener
        registeredCrawlURIDispositionListener;

    // The list used when more than one listener is registered.
    protected transient ArrayList<CrawlURIDispositionListener>
        registeredCrawlURIDispositionListeners;

    /** Shared bdb Environment for Frontier subcomponents */
    private transient EnhancedEnvironment bdbEnvironment = null;

    /**
     * Keep a list of all BigMap instances made -- shouldn't be many -- so
     * that we can checkpoint them.
     */
    private transient Map<String,CachedBdbMap<?,?>> bigmaps = null;

    /**
     * Default constructor
     */
    public CrawlController() {
        super();
    }

    /**
     * Starting from nothing, set up CrawlController and associated
     * classes to be ready for a first crawl.
     *
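     * <p>A minimal bring-up sketch (the order-file name is hypothetical;
     * assumes the XMLSettingsHandler implementation of SettingsHandler):
     * <pre>
     * SettingsHandler handler =
     *     new XMLSettingsHandler(new File("my-order.xml"));
     * handler.initialize();
     * CrawlController controller = new CrawlController();
     * controller.initialize(handler);
     * controller.requestCrawlStart();
     * </pre>
     *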
     * @param sH Settings handler.
     * @throws InitializationException
     */
    public void initialize(SettingsHandler sH)
    throws InitializationException {
        sendCrawlStateChangeEvent(PREPARING, CrawlJob.STATUS_PREPARING);

        this.singleThreadLock = new ReentrantLock();
        this.settingsHandler = sH;
        installThreadContextSettingsHandler();
        this.order = settingsHandler.getOrder();
        this.order.setController(this);
        this.bigmaps = new Hashtable<String,CachedBdbMap<?,?>>();
        sExit = "";
        this.manifest = new StringBuffer();
        String onFailMessage = "";
        try {
            onFailMessage = "You must set the User-Agent and From HTTP" +
                " header values to acceptable strings. \n" +
                " User-Agent: [software-name](+[info-url])[misc]\n" +
                " From: [email-address]\n";
            order.checkUserAgentAndFrom();

            onFailMessage = "Unable to setup disk";
            if (disk == null) {
                setupDisk();
            }

            onFailMessage = "Unable to create log file(s)";
            setupLogs();

            onFailMessage = "Unable to test/run checkpoint recover";
            this.checkpointRecover = getCheckpointRecover();
            if (this.checkpointRecover == null) {
                this.checkpointer =
                    new Checkpointer(this, this.checkpointsDisk);
            } else {
                setupCheckpointRecover();
            }

            onFailMessage = "Unable to setup bdb environment.";
            setupBdb();

            onFailMessage = "Unable to setup statistics";
            setupStatTracking();

            onFailMessage = "Unable to setup crawl modules";
            setupCrawlModules();
        } catch (Exception e) {
            String tmp = "On crawl: "
                + settingsHandler.getSettingsObject(null).getName() + " " +
                onFailMessage;
            LOGGER.log(Level.SEVERE, tmp, e);
            throw new InitializationException(tmp, e);
        }

        // Cap the dnsjava cache at a single entry: a cached value is never
        // wanted (0 would be non-operative).
        Lookup.getDefaultCache(DClass.IN).setMaxEntries(1);

        setupToePool();
        setThresholds();

        reserveMemory = new LinkedList<char[]>();
        // (Was "int i = 1", which allocated one block too few.)
        for (int i = 0; i < RESERVE_BLOCKS; i++) {
            reserveMemory.add(new char[RESERVE_BLOCK_SIZE]);
        }
    }

    /**
     * Utility method to install this crawl's SettingsHandler into the
     * 'global' (for this thread) holder, so that any subsequent
     * deserialization operations in this thread can find it.
     */
    public void installThreadContextSettingsHandler() {
        SettingsHandler.setThreadContextSettingsHandler(settingsHandler);
    }

    /**
     * Does setup of checkpoint recover.
     * Copies bdb log files into state dir.
     * @throws IOException
     */
    protected void setupCheckpointRecover()
    throws IOException {
        long started = System.currentTimeMillis();
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Starting recovery setup -- copying into place " +
                "bdbje log files -- for checkpoint named " +
                this.checkpointRecover.getDisplayName());
        }

        this.checkpointer.recover(this);
        this.progressStats.info("CHECKPOINT RECOVER " +
            this.checkpointRecover.getDisplayName());

        // Copy the bdbje log files from the checkpoint into the state
        // directory; any copy failures are collected and logged.
        File bdbSubDir = CheckpointUtils.
            getBdbSubDirectory(this.checkpointRecover.getDirectory());
        List<IOException> errs = new ArrayList<IOException>();
        FileUtils.copyFiles(bdbSubDir, CheckpointUtils.getJeLogsFilter(),
            getStateDisk(), true, false, errs);
        for (IOException ioe : errs) {
            LOGGER.log(Level.SEVERE, "Problem copying checkpoint files: " +
                "checkpoint may be corrupt", ioe);
        }
        if (LOGGER.isLoggable(Level.INFO)) {
            LOGGER.info("Finished recovery setup for checkpoint named " +
                this.checkpointRecover.getDisplayName() + " in " +
                (System.currentTimeMillis() - started) + "ms.");
        }
    }

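    /**
     * @return True if the crawl order asks that bdbje log files be copied
     * into the checkpoint directory when checkpointing.
     */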
    protected boolean getCheckpointCopyBdbjeLogs() {
        return ((Boolean)this.order.getUncheckedAttribute(null,
            CrawlOrder.ATTR_CHECKPOINT_COPY_BDBJE_LOGS)).booleanValue();
    }

    private void setupBdb()
    throws FatalConfigurationException, AttributeNotFoundException {
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        int bdbCachePercent = ((Integer)this.order.
            getAttribute(null, CrawlOrder.ATTR_BDB_CACHE_PERCENT)).intValue();
        if (bdbCachePercent > 0) {
            // Operator expressed a preference; override the bdbje default
            // or anything in je.properties.
            envConfig.setCachePercent(bdbCachePercent);
        }
        envConfig.setSharedCache(true);
        envConfig.setLockTimeout(5000000); // in microseconds == 5 seconds
        if (LOGGER.isLoggable(Level.FINEST)) {
            envConfig.setConfigParam("java.util.logging.level", "SEVERE");
            envConfig.setConfigParam("java.util.logging.level.evictor",
                "SEVERE");
            envConfig.setConfigParam("java.util.logging.ConsoleHandler.on",
                "true");
        }

        if (!getCheckpointCopyBdbjeLogs()) {
            // If we are not copying files on checkpoint, tell bdbje not to
            // delete cleaned log files, so that earlier checkpoints that
            // reference them stay recoverable.
            envConfig.setConfigParam("je.cleaner.expunge", "false");
        }

        try {
            this.bdbEnvironment = new EnhancedEnvironment(getStateDisk(), envConfig);
            if (LOGGER.isLoggable(Level.FINE)) {
                // Log the effective bdb configuration.
                envConfig = bdbEnvironment.getConfig();
                LOGGER.fine("BdbConfiguration: Cache percentage " +
                    envConfig.getCachePercent() +
                    ", cache size " + envConfig.getCacheSize());
            }
        } catch (DatabaseException e) {
            e.printStackTrace();
            throw new FatalConfigurationException(e.getMessage());
        }
    }

    /**
     * @return the shared EnhancedEnvironment
     */
    public EnhancedEnvironment getBdbEnvironment() {
        return this.bdbEnvironment;
    }

    /**
     * @deprecated use EnhancedEnvironment's getClassCatalog() instead
     */
    public StoredClassCatalog getClassCatalog() {
        return this.bdbEnvironment.getClassCatalog();
    }

    /**
     * Register for CrawlStatus events.
     *
     * @param cl a class implementing the CrawlStatusListener interface
     *
     * @see CrawlStatusListener
     */
    public void addCrawlStatusListener(CrawlStatusListener cl) {
        synchronized (this.registeredCrawlStatusListeners) {
            this.registeredCrawlStatusListeners.add(cl);
        }
    }

    /**
     * Register for CrawlURIDisposition events.
     *
     * @param cl a class implementing the CrawlURIDispositionListener interface
     *
     * @see CrawlURIDispositionListener
     */
    public void addCrawlURIDispositionListener(CrawlURIDispositionListener cl) {
        registeredCrawlURIDispositionListener = null;
        if (registeredCrawlURIDispositionListeners == null) {
            // First (and maybe only) listener: keep a direct reference for
            // the common single-listener case.
            registeredCrawlURIDispositionListener = cl;
            registeredCrawlURIDispositionListeners =
                new ArrayList<CrawlURIDispositionListener>(1);
        }
        registeredCrawlURIDispositionListeners.add(cl);
    }

    /**
     * Allows an external class to raise a CrawlURIDisposition
     * crawledURISuccessful event that will be broadcast to all listeners that
     * have registered with the CrawlController.
     *
     * @param curi - The CrawlURI that will be sent with the event notification.
     *
     * @see CrawlURIDispositionListener#crawledURISuccessful(CrawlURI)
     */
    public void fireCrawledURISuccessfulEvent(CrawlURI curi) {
        if (registeredCrawlURIDispositionListener != null) {
            // Fast path: the common, single-listener case.
            registeredCrawlURIDispositionListener.crawledURISuccessful(curi);
        } else {
            // Go through the list of registered listeners.
            if (registeredCrawlURIDispositionListeners != null
                    && registeredCrawlURIDispositionListeners.size() > 0) {
                for (CrawlURIDispositionListener l :
                        registeredCrawlURIDispositionListeners) {
                    l.crawledURISuccessful(curi);
                }
            }
        }
    }

    /**
     * Allows an external class to raise a CrawlURIDisposition
     * crawledURINeedRetry event that will be broadcast to all listeners that
     * have registered with the CrawlController.
     *
     * @param curi - The CrawlURI that will be sent with the event notification.
     *
     * @see CrawlURIDispositionListener#crawledURINeedRetry(CrawlURI)
     */
    public void fireCrawledURINeedRetryEvent(CrawlURI curi) {
        if (registeredCrawlURIDispositionListener != null) {
            // Fast path: the common, single-listener case.
            registeredCrawlURIDispositionListener.crawledURINeedRetry(curi);
            return;
        }

        // Go through the list of registered listeners.
        if (registeredCrawlURIDispositionListeners != null
                && registeredCrawlURIDispositionListeners.size() > 0) {
            for (CrawlURIDispositionListener l :
                    registeredCrawlURIDispositionListeners) {
                l.crawledURINeedRetry(curi);
            }
        }
    }

    /**
     * Allows an external class to raise a CrawlURIDisposition
     * crawledURIDisregard event that will be broadcast to all listeners that
     * have registered with the CrawlController.
     *
     * @param curi -
     *            The CrawlURI that will be sent with the event notification.
     *
     * @see CrawlURIDispositionListener#crawledURIDisregard(CrawlURI)
     */
    public void fireCrawledURIDisregardEvent(CrawlURI curi) {
        if (registeredCrawlURIDispositionListener != null) {
            // Fast path: the common, single-listener case.
            registeredCrawlURIDispositionListener.crawledURIDisregard(curi);
        } else {
            // Go through the list of registered listeners.
            if (registeredCrawlURIDispositionListeners != null
                    && registeredCrawlURIDispositionListeners.size() > 0) {
                for (CrawlURIDispositionListener l :
                        registeredCrawlURIDispositionListeners) {
                    l.crawledURIDisregard(curi);
                }
            }
        }
    }

    /**
     * Allows an external class to raise a CrawlURIDisposition
     * crawledURIFailure event that will be broadcast to all listeners that
     * have registered with the CrawlController.
     *
     * @param curi - The CrawlURI that will be sent with the event notification.
     *
     * @see CrawlURIDispositionListener#crawledURIFailure(CrawlURI)
     */
    public void fireCrawledURIFailureEvent(CrawlURI curi) {
        if (registeredCrawlURIDispositionListener != null) {
            // Fast path: the common, single-listener case.
            registeredCrawlURIDispositionListener.crawledURIFailure(curi);
        } else {
            // Go through the list of registered listeners.
            if (registeredCrawlURIDispositionListeners != null
                    && registeredCrawlURIDispositionListeners.size() > 0) {
                for (CrawlURIDispositionListener l :
                        registeredCrawlURIDispositionListeners) {
                    l.crawledURIFailure(curi);
                }
            }
        }
    }

    private void setupCrawlModules() throws FatalConfigurationException,
            AttributeNotFoundException, MBeanException, ReflectionException {
        if (scope == null) {
            scope = (CrawlScope) order.getAttribute(CrawlScope.ATTR_NAME);
            scope.initialize(this);
        }
        try {
            this.serverCache = new ServerCache(this);
        } catch (Exception e) {
            throw new FatalConfigurationException("Unable to" +
                " initialize the server cache: " + e);
        }

        if (this.frontier == null) {
            this.frontier = (Frontier)order.getAttribute(Frontier.ATTR_NAME);
            try {
                frontier.initialize(this);
                frontier.pause(); // Pause until begun.
                // Run the frontier recover log, if any. (A checkpoint
                // recovery restores the frontier's own saved state instead.)
                if (!isCheckpointRecover()) {
                    runFrontierRecover((String)order.
                        getAttribute(CrawlOrder.ATTR_RECOVER_PATH));
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "unable to initialize frontier: " + e);
            }
        }

        if (processorChains == null) {
            processorChains = new ProcessorChainList(order);
        }
    }

    protected void runFrontierRecover(String recoverPath)
    throws AttributeNotFoundException, MBeanException,
            ReflectionException, FatalConfigurationException {
        if (recoverPath == null || recoverPath.length() <= 0) {
            return;
        }
        File f = new File(recoverPath);
        if (!f.exists()) {
            LOGGER.severe("Recover file does not exist " + f.getAbsolutePath());
            return;
        }
        if (!f.isFile()) {
            // A directory here means a checkpoint recovery; that case is
            // handled via getCheckpointRecover(), not the recover log.
            return;
        }
        boolean retainFailures = ((Boolean)order.
            getAttribute(CrawlOrder.ATTR_RECOVER_RETAIN_FAILURES)).booleanValue();
        try {
            frontier.importRecoverLog(f.getAbsolutePath(), retainFailures);
        } catch (IOException e) {
            e.printStackTrace();
            throw (FatalConfigurationException) new FatalConfigurationException(
                "Recover.log " + recoverPath + " problem: " + e).initCause(e);
        }
    }

    private void setupDisk() throws AttributeNotFoundException {
        String diskPath
            = (String) order.getAttribute(null, CrawlOrder.ATTR_DISK_PATH);
        this.disk = getSettingsHandler().
            getPathRelativeToWorkingDirectory(diskPath);
        this.disk.mkdirs();
        this.logsDisk = getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        this.checkpointsDisk = getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH);
        this.stateDisk = getSettingsDir(CrawlOrder.ATTR_STATE_PATH);
        this.scratchDisk = getSettingsDir(CrawlOrder.ATTR_SCRATCH_PATH);
    }

    /**
     * @return The logging directory, or null if there was a problem reading
     * the settings.
     */
    public File getLogsDir() {
        File f = null;
        try {
            f = getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        } catch (AttributeNotFoundException e) {
            LOGGER.severe("Failed get of logs directory: " + e.getMessage());
        }
        return f;
    }

    /**
     * Return the full path to the directory named by <code>key</code>
     * in settings.
     * If the directory does not exist, it and all intermediary dirs
     * will be created.
     * @param key Key to use going to settings.
     * @return Full path to directory named by <code>key</code>.
     * @throws AttributeNotFoundException
     */
    public File getSettingsDir(String key)
    throws AttributeNotFoundException {
        String path = (String)order.getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = new File(disk.getPath(), path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }

    /**
     * Setup the statistics tracker.
     * The statistics object must be created before modules can use it.
     * Do it here now so that when modules retrieve the object from the
     * controller during initialization (which some do), it's in place.
     * @throws InvalidAttributeValueException
     * @throws FatalConfigurationException
     */
    private void setupStatTracking()
    throws InvalidAttributeValueException, FatalConfigurationException {
        MapType loggers = order.getLoggers();
        final String cstName = "crawl-statistics";
        if (loggers.isEmpty(null)) {
            if (!isCheckpointRecover() && this.statistics == null) {
                this.statistics = new StatisticsTracker(cstName);
            }
            loggers.addElement(null, (StatisticsTracker)this.statistics);
        }

        if (isCheckpointRecover()) {
            restoreStatisticsTracker(loggers, cstName);
        }

        for (Iterator it = loggers.iterator(null); it.hasNext();) {
            StatisticsTracking tracker = (StatisticsTracking)it.next();
            tracker.initialize(this);
            if (this.statistics == null) {
                this.statistics = tracker;
            }
        }
    }

    protected void restoreStatisticsTracker(MapType loggers,
            String replaceName)
    throws FatalConfigurationException {
        try {
            // Swap in the deserialized statistics tracker for the one
            // created from settings.
            loggers.removeElement(loggers.globalSettings(), replaceName);
            loggers.addElement(loggers.globalSettings(),
                (StatisticsTracker)this.statistics);
        } catch (Exception e) {
            throw convertToFatalConfigurationException(e);
        }
    }

    protected FatalConfigurationException
            convertToFatalConfigurationException(Exception e) {
        FatalConfigurationException fce =
            new FatalConfigurationException("Converted exception: " +
                e.getMessage());
        fce.setStackTrace(e.getStackTrace());
        return fce;
    }
    private void setupLogs() throws IOException {
        String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
        uriProcessing = Logger.getLogger(LOGNAME_CRAWL + "." + logsPath);
        runtimeErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS + "." +
            logsPath);
        localErrors = Logger.getLogger(LOGNAME_LOCAL_ERRORS + "." + logsPath);
        uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
        progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." +
            logsPath);

        this.fileHandlers = new HashMap<Logger,FileHandler>();

        setupLogFile(uriProcessing,
            logsPath + LOGNAME_CRAWL + CURRENT_LOG_SUFFIX,
            new UriProcessingFormatter(), true);

        setupLogFile(runtimeErrors,
            logsPath + LOGNAME_RUNTIME_ERRORS + CURRENT_LOG_SUFFIX,
            new RuntimeErrorFormatter(), true);

        setupLogFile(localErrors,
            logsPath + LOGNAME_LOCAL_ERRORS + CURRENT_LOG_SUFFIX,
            new LocalErrorFormatter(), true);

        setupLogFile(uriErrors,
            logsPath + LOGNAME_URI_ERRORS + CURRENT_LOG_SUFFIX,
            new UriErrorFormatter(), true);

        setupLogFile(progressStats,
            logsPath + LOGNAME_PROGRESS_STATISTICS + CURRENT_LOG_SUFFIX,
            new StatisticsLogFormatter(), true);
    }

    private void setupLogFile(Logger logger, String filename, Formatter f,
            boolean shouldManifest) throws IOException, SecurityException {
        GenerationFileHandler fh = new GenerationFileHandler(filename, true,
            shouldManifest);
        fh.setFormatter(f);
        logger.addHandler(fh);
        addToManifest(filename, MANIFEST_LOG_FILE, shouldManifest);
        logger.setUseParentHandlers(false);
        this.fileHandlers.put(logger, fh);
    }

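    /**
     * Rotate all established log files to a new generation.
     * <p>For example, when checkpointing, the active "crawl.log" is retired
     * under a name carrying the checkpoint's suffix and a fresh "crawl.log"
     * is opened in its place (see {@link #checkpoint()}).
     * @param generationSuffix Suffix for the retired log generation.
     * @throws IOException
     */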
    protected void rotateLogFiles(String generationSuffix)
    throws IOException {
        if (this.state != PAUSED && this.state != CHECKPOINTING) {
            throw new IllegalStateException("Pause crawl before requesting " +
                "log rotation.");
        }
        for (Map.Entry<Logger,FileHandler> entry : fileHandlers.entrySet()) {
            Logger l = entry.getKey();
            GenerationFileHandler gfh =
                (GenerationFileHandler)entry.getValue();
            GenerationFileHandler newGfh =
                gfh.rotate(generationSuffix, CURRENT_LOG_SUFFIX);
            if (gfh.shouldManifest()) {
                addToManifest((String) newGfh.getFilenameSeries().get(1),
                    MANIFEST_LOG_FILE, newGfh.shouldManifest());
            }
            l.removeHandler(gfh);
            l.addHandler(newGfh);
            entry.setValue(newGfh);
        }
    }

    /**
     * Close all log files and remove handlers from loggers.
     */
    public void closeLogFiles() {
        for (Map.Entry<Logger,FileHandler> entry : fileHandlers.entrySet()) {
            GenerationFileHandler gfh =
                (GenerationFileHandler)entry.getValue();
            gfh.close();
            entry.getKey().removeHandler(gfh);
        }
    }

    /**
     * Sets the values for max bytes, docs and time based on crawl order.
     */
    private void setThresholds() {
        try {
            maxBytes =
                ((Long) order.getAttribute(CrawlOrder.ATTR_MAX_BYTES_DOWNLOAD))
                    .longValue();
        } catch (Exception e) {
            maxBytes = 0;
        }
        try {
            maxDocument =
                ((Long) order
                    .getAttribute(CrawlOrder.ATTR_MAX_DOCUMENT_DOWNLOAD))
                    .longValue();
        } catch (Exception e) {
            maxDocument = 0;
        }
        try {
            maxTime =
                ((Long) order.getAttribute(CrawlOrder.ATTR_MAX_TIME_SEC))
                    .longValue();
        } catch (Exception e) {
            maxTime = 0;
        }
    }

    /**
     * @return Object this controller is using to track crawl statistics
     */
    public StatisticsTracking getStatistics() {
        return statistics == null ?
            new StatisticsTracker("crawl-statistics") : this.statistics;
    }

    /**
     * Send crawl change event to all listeners.
     * @param newState State change we're to tell listeners about.
     * @param message Message on state change.
     * @see #sendCheckpointEvent(File) for the special-case event telling
     * listeners to checkpoint.
     */
    protected void sendCrawlStateChangeEvent(Object newState, String message) {
        synchronized (this.registeredCrawlStatusListeners) {
            this.state = newState;
            for (Iterator i = this.registeredCrawlStatusListeners.iterator();
                    i.hasNext();) {
                CrawlStatusListener l = (CrawlStatusListener)i.next();
                if (newState.equals(PAUSED)) {
                    l.crawlPaused(message);
                } else if (newState.equals(RUNNING)) {
                    l.crawlResuming(message);
                } else if (newState.equals(PAUSING)) {
                    l.crawlPausing(message);
                } else if (newState.equals(STARTED)) {
                    l.crawlStarted(message);
                } else if (newState.equals(STOPPING)) {
                    l.crawlEnding(message);
                } else if (newState.equals(FINISHED)) {
                    l.crawlEnded(message);
                } else if (newState.equals(PREPARING)) {
                    l.crawlResuming(message);
                } else {
                    throw new RuntimeException("Unknown state: " + newState);
                }
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Sent " + newState + " to " + l);
                }
            }
            LOGGER.fine("Sent " + newState);
        }
    }

    /**
     * Send the checkpoint event.
     * Has its own method apart from
     * {@link #sendCrawlStateChangeEvent(Object, String)} because checkpointing
     * throws an Exception (didn't want to wrap all the
     * sendCrawlStateChangeEvent calls in try/catch blocks).
     * @param checkpointDir Where to write checkpoint state to.
     * @throws Exception
     */
    protected void sendCheckpointEvent(File checkpointDir) throws Exception {
        synchronized (this.registeredCrawlStatusListeners) {
            if (this.state != PAUSED) {
                throw new IllegalStateException("Crawler must be completely " +
                    "paused before checkpointing can start");
            }
            this.state = CHECKPOINTING;
            for (Iterator i = this.registeredCrawlStatusListeners.iterator();
                    i.hasNext();) {
                CrawlStatusListener l = (CrawlStatusListener)i.next();
                l.crawlCheckpoint(checkpointDir);
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Sent " + CHECKPOINTING + " to " + l);
                }
            }
            LOGGER.fine("Sent " + CHECKPOINTING);
        }
    }

    /**
     * Operator requested that the crawl begin.
     */
    public void requestCrawlStart() {
        runProcessorInitialTasks();

        sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
        String jobState;
        state = RUNNING;
        jobState = CrawlJob.STATUS_RUNNING;
        sendCrawlStateChangeEvent(this.state, jobState);

        // Assume abnormal exit until a proper shutdown changes this value.
        this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

        Thread statLogger = new Thread(statistics);
        statLogger.setName("StatLogger");
        statLogger.start();

        frontier.start();
    }

    /**
     * Called when the last toethread exits.
     */
    protected void completeStop() {
        LOGGER.fine("Entered complete stop.");

        runProcessorFinalTasks();

        sendCrawlStateChangeEvent(FINISHED, this.sExit);
        synchronized (this.registeredCrawlStatusListeners) {
            // Remove all listeners now that we're done with them.
            this.registeredCrawlStatusListeners.clear();
            this.registeredCrawlStatusListeners = null;
        }

        closeLogFiles();

        // Release references to logger file handler instances.
        this.fileHandlers = null;
        this.uriErrors = null;
        this.uriProcessing = null;
        this.localErrors = null;
        this.runtimeErrors = null;
        this.progressStats = null;
        this.reports = null;
        this.manifest = null;

        // Do cleanup to facilitate GC.
        this.statistics = null;
        this.frontier = null;
        this.disk = null;
        this.scratchDisk = null;
        this.order = null;
        this.scope = null;
        if (this.settingsHandler != null) {
            this.settingsHandler.cleanup();
        }
        this.settingsHandler = null;
        this.reserveMemory = null;
        this.processorChains = null;
        if (this.serverCache != null) {
            this.serverCache.cleanup();
            this.serverCache = null;
        }
        if (this.checkpointer != null) {
            this.checkpointer.cleanup();
            this.checkpointer = null;
        }
        if (this.bdbEnvironment != null) {
            try {
                this.bdbEnvironment.sync();
                this.bdbEnvironment.close();
            } catch (DatabaseException e) {
                e.printStackTrace();
            }
            this.bdbEnvironment = null;
        }
        this.bigmaps = null;
        if (this.toePool != null) {
            this.toePool.cleanup();
        }
        this.toePool = null;
        LOGGER.fine("Finished crawl.");
    }

    synchronized void completePause() {
        // Wake any threads waiting on this controller (e.g. a checkpointing
        // thread waiting for the pause to complete).
        notifyAll();
        sendCrawlStateChangeEvent(PAUSED, CrawlJob.STATUS_PAUSED);
    }

    private boolean shouldContinueCrawling() {
        if (frontier.isEmpty()) {
            this.sExit = CrawlJob.STATUS_FINISHED;
            return false;
        }

        if (maxBytes > 0 && frontier.totalBytesWritten() >= maxBytes) {
            // Hit the max byte download limit!
            sExit = CrawlJob.STATUS_FINISHED_DATA_LIMIT;
            return false;
        } else if (maxDocument > 0
                && frontier.succeededFetchCount() >= maxDocument) {
            // Hit the max document download limit!
            this.sExit = CrawlJob.STATUS_FINISHED_DOCUMENT_LIMIT;
            return false;
        } else if (maxTime > 0 &&
                statistics.crawlDuration() >= maxTime * 1000) {
            // Hit the max crawl time limit!
            this.sExit = CrawlJob.STATUS_FINISHED_TIME_LIMIT;
            return false;
        }
        return state == RUNNING;
    }

    /**
     * Request a checkpoint.
     * Sets a checkpointing thread running.
     * @throws IllegalStateException Thrown if crawl is not in paused state
     * (the crawl must first be paused before checkpointing).
     */
    public synchronized void requestCrawlCheckpoint()
    throws IllegalStateException {
        if (this.checkpointer == null) {
            return;
        }
        if (this.checkpointer.isCheckpointing()) {
            throw new IllegalStateException("Checkpoint already running.");
        }
        this.checkpointer.checkpoint();
    }

    /**
     * @return True if checkpointing.
     */
    public boolean isCheckpointing() {
        return this.state == CHECKPOINTING;
    }

    /**
     * Run checkpointing.
     * CrawlController takes care of managing the checkpointing/serializing
     * of bdb, the StatisticsTracker, and the CheckpointContext. Other
     * modules that want to revive themselves on checkpoint recovery need to
     * save state during their {@link CrawlStatusListener#crawlCheckpoint(File)}
     * invocation and then, in their #initialize if a module,
     * or in their #initialTasks if a processor, check with the CrawlController
     * whether it's a checkpoint recovery. If it is, read in their old state
     * from the pointed-to checkpoint directory.
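     * <p>A sketch of that recovery pattern from inside a module (methods
     * beyond those of this class are illustrative):
     * <pre>
     * if (getController().isCheckpointRecover()) {
     *     File cpDir = getController().getCheckpointRecover().getDirectory();
     *     // ... read this module's previously saved state from cpDir ...
     * }
     * </pre>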
     * <p>Default access only to be called by Checkpointer.
     * @throws Exception
     */
    void checkpoint()
    throws Exception {
        // Tell registered listeners to checkpoint.
        sendCheckpointEvent(this.checkpointer.
            getCheckpointInProgressDirectory());

        // Rotate off crawler logs.
        LOGGER.fine("Rotating log files.");
        rotateLogFiles(CURRENT_LOG_SUFFIX + "." +
            this.checkpointer.getNextCheckpointName());

        // Sync and serialize the BigMaps.
        LOGGER.fine("BigMaps.");
        checkpointBigMaps(this.checkpointer.getCheckpointInProgressDirectory());

        // Checkpoint the bdb environment.
        LOGGER.fine("Bdb environment.");
        checkpointBdb(this.checkpointer.getCheckpointInProgressDirectory());

        // Copy off order, seeds, and settings.
        LOGGER.fine("Copying settings.");
        copySettings(this.checkpointer.getCheckpointInProgressDirectory());

        // Serialize the controller itself (and with it its subordinates).
        CheckpointUtils.writeObjectToFile(this,
            this.checkpointer.getCheckpointInProgressDirectory());
    }

    /**
     * Copy off the settings.
     * @param checkpointDir Directory to write checkpoint to.
     * @throws IOException
     */
    protected void copySettings(final File checkpointDir) throws IOException {
        final List files = this.settingsHandler.getListOfAllFiles();
        boolean copiedSettingsDir = false;
        final File settingsDir = new File(this.disk, "settings");
        for (final Iterator i = files.iterator(); i.hasNext();) {
            File f = new File((String)i.next());
            if (f.getAbsolutePath().startsWith(settingsDir.getAbsolutePath())) {
                if (copiedSettingsDir) {
                    // Already copied the whole settings directory; skip
                    // this file.
                    continue;
                }
                // Copy the entire settings directory in one go.
                copiedSettingsDir = true;
                FileUtils.copyFiles(settingsDir,
                    new File(checkpointDir, settingsDir.getName()));
                continue;
            }
            FileUtils.copyFiles(f, f.isDirectory() ? checkpointDir :
                new File(checkpointDir, f.getName()));
        }
    }

    /**
     * Checkpoint bdb.
     * I used to do a call to log cleaning as suggested in the je-2.0 javadoc
     * but it took way too much time (20 minutes for a crawl of 1 million
     * items). Assume the cleaner is keeping up. Below was the log cleaning
     * loop:
     * <pre>int totalCleaned = 0;
     * for (int cleaned = 0; (cleaned = this.bdbEnvironment.cleanLog()) != 0;
     *      totalCleaned += cleaned) {
     *     LOGGER.fine("Cleaned " + cleaned + " log files.");
     * }
     * </pre>
     * <p>I also used to do a sync. But, from Mark Hayes, sync and checkpoint
     * are effectively the same thing, only sync is not configurable. He
     * suggests doing one or the other:
     * <p>MS: Reading code, Environment.sync() is a checkpoint. Looks like
     * I don't need to call a checkpoint after calling a sync?
     * <p>MH: Right, they're almost the same thing -- just do one or the other,
     * not both. With the new API, you'll need to do a checkpoint not a
     * sync, because the sync() method has no config parameter. Don't worry
     * -- it's fine to do a checkpoint even though you're not using sync.
     * @param checkpointDir Directory to write checkpoint to.
     * @throws DatabaseException
     * @throws IOException
     * @throws RuntimeException Thrown if failed setup of new bdb environment.
     */
    protected void checkpointBdb(File checkpointDir)
    throws DatabaseException, IOException, RuntimeException {
        EnvironmentConfig envConfig = this.bdbEnvironment.getConfig();
        final List bkgrdThreads = Arrays.asList(new String []
            {"je.env.runCheckpointer", "je.env.runCleaner",
                "je.env.runINCompressor"});
        try {
            // Disable bdbje background threads for the duration of the
            // checkpoint so log files don't shift underneath us.
            setBdbjeBkgrdThreads(envConfig, bkgrdThreads, "false");
            // Do a force checkpoint. That's what a sync does (i.e. doSync).
            CheckpointConfig chkptConfig = new CheckpointConfig();
            chkptConfig.setForce(true);

            // Have the checkpoint log full versions of internal nodes
            // rather than deltas; per the bdbje docs this makes the
            // checkpoint a bit bigger but reduces recovery time.
            chkptConfig.setMinimizeRecoveryTime(true);
            this.bdbEnvironment.checkpoint(chkptConfig);
            LOGGER.fine("Finished bdb checkpoint.");

            // Force a log file flip so the checkpoint ends cleanly on a
            // file boundary; the file before the flip is the last one
            // needed to recover this checkpoint.
            EnvironmentImpl envImpl =
                DbInternal.envGetEnvironmentImpl(this.bdbEnvironment);
            long firstFileInNextSet =
                DbLsn.getFileNumber(envImpl.forceLogFileFlip());

            // Copy (or just note) the log files making up this checkpoint.
            final String lastBdbCheckpointLog =
                getBdbLogFileName(firstFileInNextSet - 1);
            processBdbLogs(checkpointDir, lastBdbCheckpointLog);
            LOGGER.fine("Finished processing bdb log files.");
        } finally {
            // Restore the background threads.
            setBdbjeBkgrdThreads(envConfig, bkgrdThreads, "true");
        }
    }

    protected void processBdbLogs(final File checkpointDir,
            final String lastBdbCheckpointLog) throws IOException {
        File bdbDir = CheckpointUtils.getBdbSubDirectory(checkpointDir);
        if (!bdbDir.exists()) {
            bdbDir.mkdir();
        }
        PrintWriter pw = new PrintWriter(new FileOutputStream(new File(
            checkpointDir, "bdbje-logs-manifest.txt")));
        try {
            // Don't copy beyond the last bdbje log file of the checkpoint;
            // bdbje can keep writing new logs while we work.
            boolean pastLastLogFile = false;
            Set<String> srcFilenames = null;
            final boolean copyFiles = getCheckpointCopyBdbjeLogs();
            do {
                FilenameFilter filter = CheckpointUtils.getJeLogsFilter();
                srcFilenames =
                    new HashSet<String>(Arrays.asList(
                        getStateDisk().list(filter)));
                List tgtFilenames = Arrays.asList(bdbDir.list(filter));
                if (tgtFilenames != null && tgtFilenames.size() > 0) {
                    srcFilenames.removeAll(tgtFilenames);
                }
                if (srcFilenames.size() > 0) {
                    // Sort the remaining file names so we process them in
                    // log-file order.
                    srcFilenames = new TreeSet<String>(srcFilenames);
                    int count = 0;
                    for (final Iterator i = srcFilenames.iterator();
                            i.hasNext() && !pastLastLogFile;) {
                        String name = (String) i.next();
                        if (copyFiles) {
                            FileUtils.copyFiles(new File(getStateDisk(), name),
                                new File(bdbDir, name));
                        }
                        pw.println(name);
                        if (name.equals(lastBdbCheckpointLog)) {
                            // We're done.
                            pastLastLogFile = true;
                        }
                        count++;
                    }
                    if (LOGGER.isLoggable(Level.FINE)) {
                        LOGGER.fine("Copied " + count);
                    }
                }
            } while (!pastLastLogFile && srcFilenames != null &&
                srcFilenames.size() > 0);
        } finally {
            pw.close();
        }
    }

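    /**
     * Build a bdbje log file name from a file number: the number in hex,
     * zero-padded to eight digits, with a ".jdb" suffix. For example,
     * index 255 yields "000000ff.jdb".
     * @param index bdbje log file number.
     * @return Formatted bdbje log file name.
     */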
    protected String getBdbLogFileName(final long index) {
        String lastBdbLogFileHex = Long.toHexString(index);
        StringBuffer buffer = new StringBuffer();
        for (int i = 0; i < (8 - lastBdbLogFileHex.length()); i++) {
            buffer.append('0');
        }
        buffer.append(lastBdbLogFileHex);
        buffer.append(".jdb");
        return buffer.toString();
    }

    protected void setBdbjeBkgrdThreads(final EnvironmentConfig config,
            final List threads, final String setting) {
        for (final Iterator i = threads.iterator(); i.hasNext();) {
            config.setConfigParam((String)i.next(), setting);
        }
    }

    /**
     * Get recover checkpoint.
     * Returns null if we're NOT in recover mode.
     * Looks at ATTR_RECOVER_PATH and, if it's a directory, assumes checkpoint
     * recover. If in checkpoint mode, returns a Checkpoint instance if the
     * checkpoint was VALID (else null).
     * @return Checkpoint instance if we're in recover checkpoint
     * mode and the pointed-to checkpoint was valid.
     * @see #isCheckpointRecover()
     */
    public synchronized Checkpoint getCheckpointRecover() {
        if (this.checkpointRecover != null) {
            return this.checkpointRecover;
        }
        return getCheckpointRecover(this.order);
    }

    public static Checkpoint getCheckpointRecover(final CrawlOrder order) {
        String path = (String)order.getUncheckedAttribute(null,
            CrawlOrder.ATTR_RECOVER_PATH);
        if (path == null || path.length() <= 0) {
            return null;
        }
        File rp = new File(path);
        // If the path points at a directory, assume checkpoint recover.
        Checkpoint result = null;
        if (rp.exists() && rp.isDirectory()) {
            Checkpoint cp = new Checkpoint(rp);
            if (cp.isValid()) {
                // Only use the checkpoint if it is valid.
                result = cp;
            }
        }
        return result;
    }

    public static boolean isCheckpointRecover(final CrawlOrder order) {
        return getCheckpointRecover(order) != null;
    }

    /**
     * @return True if we're in checkpoint recover mode. Call
     * {@link #getCheckpointRecover()} to get at the Checkpoint instance
     * that has info on the checkpoint directory being recovered from.
     */
    public boolean isCheckpointRecover() {
        return this.checkpointRecover != null;
    }

    /**
     * Operator requested that the crawl stop.
     */
    public synchronized void requestCrawlStop() {
        requestCrawlStop(CrawlJob.STATUS_ABORTED);
    }

    /**
     * Operator requested that the crawl stop.
     * @param message
     */
    public synchronized void requestCrawlStop(String message) {
        if (state == STOPPING || state == FINISHED) {
            return;
        }
        if (message == null) {
            throw new IllegalArgumentException("Message cannot be null.");
        }
        this.sExit = message;
        beginCrawlStop();
    }

    /**
     * Start the process of stopping the crawl.
     */
    public void beginCrawlStop() {
        LOGGER.fine("Started.");
        sendCrawlStateChangeEvent(STOPPING, this.sExit);
        if (this.frontier != null) {
            this.frontier.terminate();
            this.frontier.unpause();
        }
        LOGGER.fine("Finished.");
    }

    /**
     * Stop the crawl temporarily.
     */
    public synchronized void requestCrawlPause() {
        if (state == PAUSING || state == PAUSED) {
            // Already about to pause.
            return;
        }
        sExit = CrawlJob.STATUS_WAITING_FOR_PAUSE;
        frontier.pause();
        sendCrawlStateChangeEvent(PAUSING, this.sExit);
        if (toePool.getActiveToeCount() == 0) {
            // No active toes, so the pause is effectively complete
            // immediately.
            completePause();
        }
    }

    /**
     * Tell if the controller is paused.
     * @return true if paused
     */
    public boolean isPaused() {
        return state == PAUSED;
    }

    public boolean isPausing() {
        return state == PAUSING;
    }

    public boolean isRunning() {
        return state == RUNNING;
    }

    /**
     * Resume crawl from paused state.
     */
    public synchronized void requestCrawlResume() {
        if (state != PAUSING && state != PAUSED && state != CHECKPOINTING) {
            // Can't resume unless the crawl was pausing, paused, or
            // checkpointing.
            return;
        }
        multiThreadMode();
        frontier.unpause();
        LOGGER.fine("Crawl resumed.");
        sendCrawlStateChangeEvent(RUNNING, CrawlJob.STATUS_RUNNING);
    }

    /**
     * @return Active toe thread count.
     */
    public int getActiveToeCount() {
        if (toePool == null) {
            return 0;
        }
        return toePool.getActiveToeCount();
    }

    private void setupToePool() {
        toePool = new ToePool(this);
        // Ensure the pool is at the size the order requests.
        toePool.setSize(order.getMaxToes());
    }

    /**
     * @return The order file instance.
     */
    public CrawlOrder getOrder() {
        return order;
    }

    /**
     * @return The server cache instance.
     */
    public ServerCache getServerCache() {
        return serverCache;
    }

    /**
     * @param o Crawl order to use.
     */
    public void setOrder(CrawlOrder o) {
        order = o;
    }

    /**
     * @return The frontier.
     */
    public Frontier getFrontier() {
        return frontier;
    }

    /**
     * @return This crawl scope.
     */
    public CrawlScope getScope() {
        return scope;
    }

    /** Get the list of processor chains.
     *
     * @return the list of processor chains.
     */
    public ProcessorChainList getProcessorChainList() {
        return processorChains;
    }

    /** Get the first processor chain.
     *
     * @return the first processor chain.
     */
    public ProcessorChain getFirstProcessorChain() {
        return processorChains.getFirstChain();
    }

    /** Get the postprocessor chain.
     *
     * @return the postprocessor chain.
     */
    public ProcessorChain getPostprocessorChain() {
        return processorChains.getLastChain();
    }

    /**
     * Get the 'working' directory of the current crawl.
     * @return the 'working' directory of the current crawl.
     */
    public File getDisk() {
        return disk;
    }

    /**
     * @return Scratch disk location.
     */
    public File getScratchDisk() {
        return scratchDisk;
    }

    /**
     * @return State disk location.
     */
    public File getStateDisk() {
        return stateDisk;
    }

    /**
     * @return The number of ToeThreads
     *
     * @see ToePool#getToeCount()
     */
    public int getToeCount() {
        return this.toePool == null ? 0 : this.toePool.getToeCount();
    }

    /**
     * @return The ToePool
     */
    public ToePool getToePool() {
        return toePool;
    }

    /**
     * @return toepool one-line report
     */
    public String oneLineReportThreads() {
        return toePool.singleLineReport();
    }

    /**
     * While many settings will update automatically when the SettingsHandler
     * is modified, some settings need to be explicitly changed to reflect new
     * settings. This includes the number of toe threads and the seeds.
     */
    public void kickUpdate() {
        // Make sure the latest SettingsHandler is in this thread's context.
        installThreadContextSettingsHandler();

        toePool.setSize(order.getMaxToes());

        this.scope.kickUpdate();
        this.frontier.kickUpdate();
        this.processorChains.kickUpdate();

        // Refresh the max-bytes/docs/time thresholds from the order.
        setThresholds();
    }

    /**
     * @return The settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return settingsHandler;
    }

    /**
     * This method iterates through processor chains to run processors'
     * initial tasks.
     */
    private void runProcessorInitialTasks() {
        for (Iterator ic = processorChains.iterator(); ic.hasNext();) {
            for (Iterator ip = ((ProcessorChain) ic.next()).iterator();
                    ip.hasNext();) {
                ((Processor) ip.next()).initialTasks();
            }
        }
    }

    /**
     * This method iterates through processor chains to run processors' final
     * tasks.
     */
    private void runProcessorFinalTasks() {
        for (Iterator ic = processorChains.iterator(); ic.hasNext();) {
            for (Iterator ip = ((ProcessorChain) ic.next()).iterator();
                    ip.hasNext();) {
                ((Processor) ip.next()).finalTasks();
            }
        }
    }

    /**
     * Kills a thread. For details see
     * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
     * ToePool.killThread(int, boolean)}.
     * @param threadNumber Thread to kill.
     * @param replace Should thread be replaced.
     * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
     */
    public void killThread(int threadNumber, boolean replace) {
        toePool.killThread(threadNumber, replace);
    }

    /**
     * Add a file to the manifest of files used/generated by the current
     * crawl.
     *
     * TODO: It's possible for a file to be added twice if reports are
     * force-generated mid-crawl. Fix.
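     * <p>Each manifest entry is one line: the type character, a '+' or '-'
     * bundling flag, then the file path, e.g.
     * <pre>L+ /crawls/myjob/logs/crawl.log</pre>
     * (the path shown is hypothetical).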
     *
     * @param file The filename (with absolute path) of the file to add
     * @param type The type of the file
     * @param bundle Should the file be included in a typical bundling of
     * crawler files.
     *
     * @see #MANIFEST_CONFIG_FILE
     * @see #MANIFEST_LOG_FILE
     * @see #MANIFEST_REPORT_FILE
     */
    public void addToManifest(String file, char type, boolean bundle) {
        manifest.append(type + (bundle ? "+" : "-") + " " + file + "\n");
    }

    /**
     * Evaluate if the crawl should stop because it is finished.
     */
    public void checkFinish() {
        if (atFinish()) {
            beginCrawlStop();
        }
    }

    /**
     * Evaluate if the crawl should stop because it is finished,
     * without actually stopping the crawl.
     *
     * @return true if crawl is at a finish-possible state
     */
    public boolean atFinish() {
        return state == RUNNING && !shouldContinueCrawling();
    }

    private void readObject(ObjectInputStream stream)
    throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        // The listeners list is transient; recreate it on deserialization.
        this.registeredCrawlStatusListeners =
            Collections.synchronizedList(new ArrayList<CrawlStatusListener>());
        // Ensure no holdover single-thread mode.
        singleThreadMode = false;
    }

    /**
     * Go to single thread mode, where only one ToeThread may
     * proceed at a time. Also acquires the single lock, so
     * no further threads will proceed past an
     * acquireContinuePermission. Caller must be sure to release the
     * lock to allow other threads to proceed one at a time.
     */
    public void singleThreadMode() {
        this.singleThreadLock.lock();
        singleThreadMode = true;
    }

    /**
     * Go back to regular multi-thread mode, where all
     * ToeThreads may proceed at once.
     */
    public void multiThreadMode() {
        this.singleThreadLock.lock();
        singleThreadMode = false;
        // The lock is reentrant, so release every hold this thread has.
        while (this.singleThreadLock.isHeldByCurrentThread()) {
            this.singleThreadLock.unlock();
        }
    }

    /**
     * Proceed only if allowed, giving CrawlController a chance
     * to enforce single-thread mode.
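     * <p>The intended usage pattern from a worker thread is a sketch like:
     * <pre>
     * controller.acquireContinuePermission();
     * try {
     *     // ... process one CrawlURI ...
     * } finally {
     *     controller.releaseContinuePermission();
     * }
     * </pre>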
     */
    public void acquireContinuePermission() {
        if (singleThreadMode) {
            this.singleThreadLock.lock();
            if (!singleThreadMode) {
                // If the mode changed while we waited, release the lock
                // we just took and proceed.
                while (this.singleThreadLock.isHeldByCurrentThread()) {
                    this.singleThreadLock.unlock();
                }
            }
        }
    }

    /**
     * Relinquish continue permission at end of processing (allowing
     * another thread to proceed if in single-thread mode).
     */
    public void releaseContinuePermission() {
        if (singleThreadMode) {
            while (this.singleThreadLock.isHeldByCurrentThread()) {
                this.singleThreadLock.unlock();
            }
        }
    }

    public void freeReserveMemory() {
        if (!reserveMemory.isEmpty()) {
            // Give back one reserve block and nudge the collector so the
            // freed space becomes available to cleanup code.
            reserveMemory.removeLast();
            System.gc();
        }
    }

    /**
     * Note that a ToeThread reached paused condition, possibly
     * completing the crawl-pause.
     */
    public synchronized void toePaused() {
        releaseContinuePermission();
        if (state == PAUSING && toePool.getActiveToeCount() == 0) {
            completePause();
        }
    }

    /**
     * Note that a ToeThread ended, possibly completing the crawl-stop.
     */
    public synchronized void toeEnded() {
        if (state == STOPPING && toePool.getActiveToeCount() == 0) {
            completeStop();
        }
    }

    /**
     * Add order file contents to manifest.
     * Writes configuration files and any files managed by CrawlController to
     * it; files managed by other classes, excluding the settings framework,
     * are responsible for adding their files to the manifest themselves,
     * by calling addToManifest.
     * Call before writing out reports.
     */
    public void addOrderToManifest() {
        for (Iterator it = getSettingsHandler().getListOfAllFiles().iterator();
                it.hasNext();) {
            addToManifest((String)it.next(),
                CrawlController.MANIFEST_CONFIG_FILE, true);
        }
    }

    /**
     * Log a URIException from deep inside other components to the crawl's
     * shared log.
     *
     * @param e URIException encountered
     * @param u CrawlURI where problem occurred
     * @param l String which could not be interpreted as URI without exception
     */
    public void logUriError(URIException e, UURI u, CharSequence l) {
        if (e.getReasonCode() == UURIFactory.IGNORED_SCHEME) {
            // Don't log URIs with intentionally ignored schemes.
            return;
        }
        Object[] array = {u, l};
        uriErrors.log(Level.INFO, e.getMessage(), array);
    }

    // Reporter implementation.

    public final static String PROCESSORS_REPORT = "processors";
    public final static String MANIFEST_REPORT = "manifest";
    protected final static String[] REPORTS =
        {PROCESSORS_REPORT, MANIFEST_REPORT};

    public String[] getReports() {
        return REPORTS;
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }

    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

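    /**
     * Write a named report to the given writer.
     * <p>For example, dumping the manifest report to the console might look
     * like this (sketch):
     * <pre>
     * controller.reportTo(CrawlController.MANIFEST_REPORT,
     *     new PrintWriter(System.out, true));
     * </pre>
     * @param name Name of report from {@link #getReports()}, or null for the
     * default single-line report.
     * @param writer Where to write the report.
     */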
    public void reportTo(String name, PrintWriter writer) {
        if (PROCESSORS_REPORT.equals(name)) {
            reportProcessorsTo(writer);
            return;
        } else if (MANIFEST_REPORT.equals(name)) {
            reportManifestTo(writer);
            return;
        } else if (name != null) {
            writer.println("requested report unknown: " + name);
        }
        singleLineReportTo(writer);
    }

    /**
     * @param writer Where to write report to.
     */
    protected void reportManifestTo(PrintWriter writer) {
        writer.print(manifest.toString());
    }

    /**
     * Compiles and returns a human readable report on the active processors.
     * @param writer Where to write to.
     * @see org.archive.crawler.framework.Processor#report()
     */
    protected void reportProcessorsTo(PrintWriter writer) {
        writer.print(
            "Processors report - "
                + ArchiveUtils.get12DigitDate()
                + "\n");
        writer.print(" Job being crawled: " + getOrder().getCrawlOrderName()
            + "\n");

        writer.print(" Number of Processors: " +
            processorChains.processorCount() + "\n");
        writer.print(" NOTE: Some processors may not return a report!\n\n");

        for (Iterator ic = processorChains.iterator(); ic.hasNext();) {
            for (Iterator ip = ((ProcessorChain) ic.next()).iterator();
                    ip.hasNext();) {
                writer.print(((Processor) ip.next()).report());
            }
        }
    }

    public void singleLineReportTo(PrintWriter writer) {
        writer.write("[Crawl Controller]\n");
    }

    public String singleLineLegend() {
        return "nothingYet";
    }

    /**
     * Call this method to get an instance of the crawler BigMap
     * implementation.
     * A "BigMap" is a Map that knows how to manage ever-growing sets of
     * key/value pairs. If we're in a checkpoint recovery, this method will
     * manage reinstantiation of checkpointed bigmaps.
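     * <p>A usage sketch (the database name and value type are hypothetical):
     * <pre>
     * Map&lt;String,String&gt; seenHosts =
     *     controller.getBigMap("seenHosts", String.class, String.class);
     * seenHosts.put("example.com", "seen");
     * </pre>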
     * @param dbName Name to give any associated database. Also used
     * as part of the name when serializing out the bigmap. Needs to be
     * unique to a crawl.
     * @param keyClass Class of keys we'll be using.
     * @param valueClass Class of values we'll be using.
     * @return Map that knows how to carry large sets of key/value pairs or,
     * if none available, returns instance of HashMap.
     * @throws Exception
     */
    public <K,V> Map<K,V> getBigMap(final String dbName,
            final Class<? super K> keyClass,
            final Class<? super V> valueClass)
    throws Exception {
        CachedBdbMap<K,V> result = new CachedBdbMap<K,V>(dbName);
        if (isCheckpointRecover()) {
            File baseDir = getCheckpointRecover().getDirectory();
            @SuppressWarnings("unchecked")
            CachedBdbMap<K,V> temp = CheckpointUtils.
                readObjectFromFile(result.getClass(), dbName, baseDir);
            result = temp;
        }
        result.initialize(getBdbEnvironment(), keyClass, valueClass,
            getBdbEnvironment().getClassCatalog());
        // Save a reference to every bigmap made so we can checkpoint them.
        this.bigmaps.put(dbName, result);
        return result;
    }

    protected void checkpointBigMaps(final File cpDir)
    throws Exception {
        for (Map.Entry<String,CachedBdbMap<?,?>> entry :
                this.bigmaps.entrySet()) {
            // Sync the bigmap's in-memory content to bdb before serializing
            // the map itself into the checkpoint directory.
            entry.getValue().sync();
            CheckpointUtils.writeObjectToFile(entry.getValue(), entry.getKey(),
                cpDir);
        }
    }

    /**
     * Called on each progress-statistics logging event.
     * @param e Progress statistics event.
     */
    public void progressStatisticsEvent(final EventObject e) {
        // Default is to do nothing. Subclasses that want to act on the
        // progress-statistics tick can override this.
    }

    /**
     * Log to the progress statistics log.
     * @param msg Message to write the progress statistics log.
     */
    public void logProgressStatistics(final String msg) {
        this.progressStats.info(msg);
    }

    /**
     * @return CrawlController state.
     */
    public Object getState() {
        return this.state;
    }

    public File getCheckpointsDisk() {
        return this.checkpointsDisk;
    }
}