1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler;
26
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.FileOutputStream;
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.io.PrintStream;
34 import java.io.PrintWriter;
35 import java.net.HttpURLConnection;
36 import java.net.InetAddress;
37 import java.net.URL;
38 import java.net.URLConnection;
39 import java.net.UnknownHostException;
40 import java.util.ArrayList;
41 import java.util.Arrays;
42 import java.util.Collection;
43 import java.util.Collections;
44 import java.util.Enumeration;
45 import java.util.Hashtable;
46 import java.util.Iterator;
47 import java.util.List;
48 import java.util.Map;
49 import java.util.Properties;
50 import java.util.StringTokenizer;
51 import java.util.TimeZone;
52 import java.util.Vector;
53 import java.util.logging.Level;
54 import java.util.logging.LogManager;
55 import java.util.logging.Logger;
56
57 import javax.management.Attribute;
58 import javax.management.AttributeList;
59 import javax.management.AttributeNotFoundException;
60 import javax.management.DynamicMBean;
61 import javax.management.InstanceAlreadyExistsException;
62 import javax.management.InstanceNotFoundException;
63 import javax.management.InvalidAttributeValueException;
64 import javax.management.MBeanInfo;
65 import javax.management.MBeanNotificationInfo;
66 import javax.management.MBeanOperationInfo;
67 import javax.management.MBeanRegistration;
68 import javax.management.MBeanRegistrationException;
69 import javax.management.MBeanServer;
70 import javax.management.MBeanServerFactory;
71 import javax.management.MalformedObjectNameException;
72 import javax.management.NotCompliantMBeanException;
73 import javax.management.ObjectName;
74 import javax.management.ReflectionException;
75 import javax.management.RuntimeOperationsException;
76 import javax.management.openmbean.CompositeData;
77 import javax.management.openmbean.CompositeDataSupport;
78 import javax.management.openmbean.CompositeType;
79 import javax.management.openmbean.OpenDataException;
80 import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
81 import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
82 import javax.management.openmbean.OpenMBeanInfoSupport;
83 import javax.management.openmbean.OpenMBeanOperationInfoSupport;
84 import javax.management.openmbean.OpenMBeanParameterInfo;
85 import javax.management.openmbean.OpenMBeanParameterInfoSupport;
86 import javax.management.openmbean.OpenType;
87 import javax.management.openmbean.SimpleType;
88 import javax.management.openmbean.TabularData;
89 import javax.management.openmbean.TabularDataSupport;
90 import javax.management.openmbean.TabularType;
91 import javax.naming.CompoundName;
92 import javax.naming.Context;
93 import javax.naming.NameNotFoundException;
94 import javax.naming.NamingException;
95 import javax.naming.NoInitialContextException;
96
97 import org.apache.commons.cli.Option;
98 import org.archive.crawler.admin.CrawlJob;
99 import org.archive.crawler.admin.CrawlJobErrorHandler;
100 import org.archive.crawler.admin.CrawlJobHandler;
101 import org.archive.crawler.datamodel.CredentialStore;
102 import org.archive.crawler.datamodel.credential.Credential;
103 import org.archive.crawler.event.CrawlStatusListener;
104 import org.archive.crawler.framework.AlertManager;
105 import org.archive.crawler.framework.CrawlController;
106 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
107 import org.archive.crawler.framework.exceptions.InitializationException;
108 import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
109 import org.archive.crawler.settings.XMLSettingsHandler;
110 import org.archive.io.SinkHandler;
111 import org.archive.io.SinkHandlerLogRecord;
112 import org.archive.net.UURI;
113 import org.archive.util.FileUtils;
114 import org.archive.util.IoUtils;
115 import org.archive.util.JmxUtils;
116 import org.archive.util.JndiUtils;
117 import org.archive.util.PropertyUtils;
118 import org.archive.util.TextUtils;
119
120 import sun.net.www.protocol.file.FileURLConnection;
121
122
123 /***
124 * Main class for Heritrix crawler.
125 *
126 * Heritrix is usually launched by a shell script that backgrounds heritrix
* and redirects all stdout and stderr emitted by heritrix to a log file. So
128 * that startup messages emitted subsequent to the redirection of stdout and
129 * stderr show on the console, this class prints usage or startup output
130 * such as where the web UI can be found, etc., to a STARTLOG that the shell
131 * script is waiting on. As soon as the shell script sees output in this file,
132 * it prints its content and breaks out of its wait.
133 * See ${HERITRIX_HOME}/bin/heritrix.
134 *
135 * <p>Heritrix can also be embedded or launched by webapp initialization or
136 * by JMX bootstrapping. So far I count 4 methods of instantiation:
137 * <ol>
* <li>From this class's main -- the method usually used;</li>
139 * <li>From the Heritrix UI (The local-instances.jsp) page;</li>
140 * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
141 * <li>A container such as tomcat or jboss.</li>
142 * </ol>
143 *
144 * @author gojomo
145 * @author Kristinn Sigurdsson
146 * @author Stack
147 */
148 public class Heritrix implements DynamicMBean, MBeanRegistration {
149 /***
150 * Heritrix logging instance.
151 */
152 private static final Logger logger =
153 Logger.getLogger(Heritrix.class.getName());
154
155 private static final File TMPDIR =
156 new File(System.getProperty("java.io.tmpdir", "/tmp"));
157
158 /***
159 * Name of the heritrix properties file.
160 */
161 private static final String PROPERTIES = "heritrix.properties";
162
163 /***
164 * Name of the key to use specifying alternate heritrix properties on
165 * command line.
166 */
167 private static final String PROPERTIES_KEY = PROPERTIES;
168
169 /***
170 * Prefix used on our properties we'll add to the System.properties list.
171 */
172 private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
173
174 /***
175 * Prefix used on other properties we'll add to the System.properties
176 * list (after stripping this prefix).
177 */
178 private static final String SYSTEM_PREFIX = "system.";
179
180 /***
181 * Instance of web server if one was started.
182 */
183 private static SimpleHttpServer httpServer = null;
184
185 /***
186 * CrawlJob handler. Manages multiple crawl jobs at runtime.
187 */
188 private CrawlJobHandler jobHandler = null;
189
190 /***
191 * Heritrix start log file.
192 *
193 * This file contains standard out produced by this main class for startup
194 * only. Used by heritrix shell script. Name here MUST match that in the
195 * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell
196 * wrapper has on this here java heritrix.
197 */
198 private static final String STARTLOG = "heritrix_dmesg.log";
199
200 /***
201 * Default encoding.
202 *
203 * Used for content when fetching if none specified.
204 */
205 public static final String DEFAULT_ENCODING = "ISO-8859-1";
206
207 /***
208 * Heritrix stderr/stdout log file.
209 *
210 * This file should have nothing in it except messages over which we have
211 * no control (JVM stacktrace, 3rd-party lib emissions). The wrapper
212 * startup script directs stderr/stdout here. This is an INTERDEPENDENCY
213 * this program has with the wrapper shell script. Shell can actually
214 * pass us an alternate to use for this file.
215 */
216 private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
217
218 /***
219 * Where to write this classes startup output.
220 *
221 * This out should only be used if Heritrix is being run from the
222 * command-line.
223 */
224 private static PrintWriter out = null;
225
226 /***
227 * The org.archive package
228 */
229 private static final String ARCHIVE_PACKAGE = "org.archive.";
230
231 /***
232 * The crawler package.
233 */
234 private static final String CRAWLER_PACKAGE = Heritrix.class.getName().
235 substring(0, Heritrix.class.getName().lastIndexOf('.'));
236
237 /***
238 * The root context for a webapp.
239 */
240 private static final String ROOT_CONTEXT = "/";
241
242 /***
243 * Set to true if application is started from command line.
244 */
245 private static boolean commandLine = false;
246
247 /***
248 * True if container initialization has been run.
249 */
250 private static boolean containerInitialized = false;
251
252 /***
253 * True if properties have been loaded.
254 */
255 private static boolean propertiesLoaded = false;
256
257 private static final String JAR_SUFFIX = ".jar";
258
259 private AlertManager alertManager;
260
261 /***
262 * The context of the GUI webapp. Default is root.
263 */
264 private static String adminContext = ROOT_CONTEXT;
265
266 /***
267 * True if we're to put up a GUI.
268 * Cmdline processing can override.
269 */
270 private static boolean gui =
271 !PropertyUtils.getBooleanProperty("heritrix.cmdline.nowui");
272
273 /***
274 * Port to put the GUI up on.
275 * Cmdline processing can override.
276 */
277 private static int guiPort = SimpleHttpServer.DEFAULT_PORT;
278
279
280 /***
281 * A collection containing only localhost. Used as default value
282 * for guiHosts, and passed to SimpleHttpServer when doing selftest.
283 */
284 final private static Collection<String> LOCALHOST_ONLY =
285 Collections.unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" }));
286
287
288 /***
289 * Hosts to bind the GUI webserver to.
290 * By default, only contans localhost.
291 * Set to an empty collection to indicate that all available network
292 * interfaces should be used for the webserver.
293 */
294 private static Collection<String> guiHosts = LOCALHOST_ONLY;
295
296
297 /***
298 * Web UI server, realm, context name.
299 */
300 private static String ADMIN = "admin";
301
302
303 /***
304 * The MBean server we're registered with (May be null).
305 */
306 private MBeanServer mbeanServer = null;
307
308 /***
309 * MBean name we were registered as.
310 */
311 private ObjectName mbeanName = null;
312
313 /***
314 * Keep reference to all instances of Heritrix.
315 * Used by the UI to figure which of the local Heritrice it should
316 * be going against and to figure what to shutdown on the way out (If
317 * there was always a JMX Agent, we wouldn't need to keep this list. We
318 * could always ask the JMX Agent for all instances. UPDATE: True we could
319 * always ask the JMX Agent but we might keep around this local reference
320 * because it will allow faster, less awkward -- think of marshalling the args
321 * for JMX invoke operation -- access to local Heritrix instances. A new
322 * usage for this instances Map is in CrawlJob#preRegister to find the hosting
323 * Heritrix instance).
324 */
325 private static Map<String,Heritrix> instances
326 = new Hashtable<String,Heritrix>();
327
328 private OpenMBeanInfoSupport openMBeanInfo;
329 private final static String STATUS_ATTR = "Status";
330 private final static String VERSION_ATTR = "Version";
331 private final static String ISRUNNING_ATTR = "IsRunning";
332 private final static String ISCRAWLING_ATTR = "IsCrawling";
333 private final static String ALERTCOUNT_ATTR = "AlertCount";
334 private final static String NEWALERTCOUNT_ATTR = "NewAlertCount";
335 private final static String CURRENTJOB_ATTR = "CurrentJob";
336 private final static List ATTRIBUTE_LIST;
337 static {
338 ATTRIBUTE_LIST = Arrays.asList(new String [] {STATUS_ATTR,
339 VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR,
340 ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR});
341 }
342
343 private final static String START_OPER = "start";
344 private final static String STOP_OPER = "stop";
345 private final static String DESTROY_OPER = "destroy";
346 private final static String INTERRUPT_OPER = "interrupt";
347 private final static String START_CRAWLING_OPER = "startCrawling";
348 private final static String STOP_CRAWLING_OPER = "stopCrawling";
349 private final static String ADD_CRAWL_JOB_OPER = "addJob";
350 private final static String TERMINATE_CRAWL_JOB_OPER =
351 "terminateCurrentJob";
352 private final static String DELETE_CRAWL_JOB_OPER = "deleteJob";
353 private final static String ALERT_OPER = "alert";
354 private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";
355 private final static String PENDING_JOBS_OPER = "pendingJobs";
356 private final static String COMPLETED_JOBS_OPER = "completedJobs";
357 private final static String CRAWLEND_REPORT_OPER = "crawlendReport";
358 private final static String SHUTDOWN_OPER = "shutdown";
359 private final static String LOG_OPER = "log";
360 private final static String REBIND_JNDI_OPER = "rebindJNDI";
361 private final static List OPERATION_LIST;
362 static {
363 OPERATION_LIST = Arrays.asList(new String [] {START_OPER, STOP_OPER,
364 INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER,
365 ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER,
366 DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER,
367 COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER,
368 LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,
369 REBIND_JNDI_OPER});
370 }
371 private CompositeType jobCompositeType = null;
372 private TabularType jobsTabularType = null;
373 private static final String [] JOB_KEYS =
374 new String [] {"uid", "name", "status"};
375
376 private static String adminUsername;
377
378 private static String adminPassword;
379
/**
 * Creates the default Heritrix instance without registering it with JMX.
 *
 * Intended for callers such as a JMX agent creating an instance at the
 * command of a remote client, in which case the invoking agent does the
 * registration. Equivalent to <code>Heritrix(null, false)</code>.
 *
 * @throws IOException
 */
public Heritrix() throws IOException {
    this(null, false);
}
391
/**
 * Creates the default Heritrix instance (null name).
 *
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @throws IOException
 */
public Heritrix(final boolean jmxregister) throws IOException {
    this(null, jmxregister);
}
395
/**
 * Creates an instance backed by a new CrawlJobHandler rooted at
 * {@link #getJobsdir()}.
 *
 * @param name Instance name; null brings up the default Heritrix instance.
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @throws IOException
 */
public Heritrix(final String name, final boolean jmxregister)
throws IOException {
    this(name, jmxregister, new CrawlJobHandler(getJobsdir()));
}
407
/**
 * Creates an instance around the supplied job handler.
 *
 * Runs the container initialization first, then builds the MBean info,
 * wraps the SinkHandler singleton in an AlertManager, and finally
 * registers the new instance via <code>registerHeritrix</code>.
 *
 * @param name Instance name; null brings up the default Heritrix instance.
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @param cjh CrawlJobHandler to use.
 * @throws IOException
 * @throws NullPointerException if no SinkHandler instance is available.
 * @throws RuntimeException wrapping any JMX registration failure.
 */
public Heritrix(final String name, final boolean jmxregister,
        final CrawlJobHandler cjh)
throws IOException {
    // The superclass constructor is invoked implicitly.
    containerInitialization();
    this.jobHandler = cjh;
    this.openMBeanInfo = buildMBeanInfo();

    // Fetched only after container initialization has run.
    final SinkHandler sink = SinkHandler.getInstance();
    if (sink == null) {
        throw new NullPointerException("SinkHandler not found.");
    }

    // Every AlertManager call is forwarded to the sink handler; alert IDs
    // arrive as strings and are parsed to longs.
    this.alertManager = new AlertManager() {
        public int getCount() {
            return sink.getCount();
        }

        public int getNewCount() {
            return sink.getUnreadCount();
        }

        public Vector getAll() {
            return sink.getAll();
        }

        public Vector getNewAll() {
            return sink.getAllUnread();
        }

        public SinkHandlerLogRecord get(String alertID) {
            return sink.get(Long.parseLong(alertID));
        }

        public void add(SinkHandlerLogRecord record) {
            sink.publish(record);
        }

        public void read(String alertID) {
            sink.read(Long.parseLong(alertID));
        }

        public void remove(String alertID) {
            sink.remove(Long.parseLong(alertID));
        }
    };

    // Checked JMX failures are surfaced to the caller as unchecked ones.
    try {
        Heritrix.registerHeritrix(this, name, jmxregister);
    } catch (InstanceAlreadyExistsException ex) {
        throw new RuntimeException(ex);
    } catch (MBeanRegistrationException ex) {
        throw new RuntimeException(ex);
    } catch (NotCompliantMBeanException ex) {
        throw new RuntimeException(ex);
    } catch (MalformedObjectNameException ex) {
        throw new RuntimeException(ex);
    }
}
477
478 /***
479 * Run setup tasks for this 'container'. Idempotent.
480 *
481 * @throws IOException
482 */
483 protected static void containerInitialization() throws IOException {
484 if (Heritrix.containerInitialized) {
485 return;
486 }
487 Heritrix.containerInitialized = true;
488
489
490
491
492 Heritrix.loadProperties();
493 Heritrix.patchLogging();
494 Heritrix.configureTrustStore();
495
496
497
498 Runtime.getRuntime().addShutdownHook(
499 Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook"));
500
501
502 try {
503 registerContainerJndi();
504 } catch (Exception e) {
505 logger.log(Level.WARNING, "Failed jndi container registration.", e);
506 }
507 }
508
509 /***
510 * Do inverse of construction. Used by anyone who does a 'new Heritrix' when
511 * they want to cleanup the instance.
512 * Of note, there may be Heritrix threads still hanging around after the
513 * call to destroy completes. They'll eventually go down after they've
514 * finished their cleanup routines. In particular, if you are watching
515 * Heritrix via JMX, you can see the Heritrix instance JMX bean unregister
516 * ahead of the CrawlJob JMX bean that its hosting.
517 */
518 public void destroy() {
519 stop();
520 try {
521 Heritrix.unregisterHeritrix(this);
522 } catch (InstanceNotFoundException e) {
523 e.printStackTrace();
524 } catch (MBeanRegistrationException e) {
525 e.printStackTrace();
526 } catch (NullPointerException e) {
527 e.printStackTrace();
528 }
529 this.jobHandler = null;
530 this.openMBeanInfo = null;
531 }
532
533 /***
534 * Launch program.
535 * Optionally will launch a web server to host UI. Will also register
536 * Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM
537 * Agent).
538 *
539 * @param args Command line arguments.
540 * @throws Exception
541 */
542 public static void main(String[] args)
543 throws Exception {
544 Heritrix.commandLine = true;
545
546
547
548 TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
549
550 File startLog = new File(getHeritrixHome(), STARTLOG);
551 Heritrix.out = new PrintWriter(isDevelopment()?
552 System.out: new PrintStream(new FileOutputStream(startLog)));
553
554 try {
555 containerInitialization();
556 String status = doCmdLineArgs(args);
557 if (status != null) {
558 Heritrix.out.println(status);
559 }
560 }
561
562 catch(Exception e) {
563
564 e.printStackTrace(Heritrix.out);
565 throw e;
566 }
567
568 finally {
569
570
571
572 if (!isDevelopment()) {
573 if (Heritrix.out != null) {
574 Heritrix.out.close();
575 }
576 System.out.println("Heritrix version: " +
577 Heritrix.getVersion());
578 } else {
579 if (Heritrix.out != null) {
580 Heritrix.out.flush();
581 }
582 }
583 }
584 }
585
586 protected static String doCmdLineArgs(final String [] args)
587 throws Exception {
588
589 String tmpStr = PropertyUtils.
590 getPropertyOrNull("heritrix.context");
591 if (tmpStr != null) {
592 Heritrix.adminContext = tmpStr;
593 }
594 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port");
595 if (tmpStr != null) {
596 Heritrix.guiPort = Integer.parseInt(tmpStr);
597 }
598 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin");
599 String adminLoginPassword = (tmpStr == null)? "": tmpStr;
600 String crawlOrderFile =
601 PropertyUtils.getPropertyOrNull("heritrix.cmdline.order");
602 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run");
603 boolean runMode =
604 PropertyUtils.getBooleanProperty("heritrix.cmdline.run");
605 boolean selfTest = false;
606 String selfTestName = null;
607 CommandLineParser clp = new CommandLineParser(args, Heritrix.out,
608 Heritrix.getVersion());
609 List arguments = clp.getCommandLineArguments();
610 Option [] options = clp.getCommandLineOptions();
611
612
613
614 if (arguments.size() > 1) {
615 clp.usage(1);
616 } else if (arguments.size() == 1) {
617 crawlOrderFile = (String)arguments.get(0);
618 if (!(new File(crawlOrderFile).exists())) {
619 clp.usage("ORDER.XML <" + crawlOrderFile +
620 "> specified does not exist.", 1);
621 }
622
623 if (crawlOrderFile.length() > 4 &&
624 !crawlOrderFile.substring(crawlOrderFile.length() - 4).
625 equalsIgnoreCase(".xml")) {
626 clp.usage("ORDER.XML <" + crawlOrderFile +
627 "> does not have required '.xml' suffix.", 1);
628 }
629 }
630
631
632 for (int i = 0; i < options.length; i++) {
633 switch(options[i].getId()) {
634 case 'h':
635 clp.usage();
636 break;
637
638 case 'a':
639 adminLoginPassword = options[i].getValue();
640 break;
641
642 case 'n':
643 if (crawlOrderFile == null) {
644 clp.usage("You must specify an ORDER_FILE with" +
645 " '--nowui' option.", 1);
646 }
647 Heritrix.gui = false;
648 break;
649
650 case 'b':
651 Heritrix.guiHosts = parseHosts(options[i].getValue());
652 break;
653
654 case 'p':
655 try {
656 Heritrix.guiPort =
657 Integer.parseInt(options[i].getValue());
658 } catch (NumberFormatException e) {
659 clp.usage("Failed parse of port number: " +
660 options[i].getValue(), 1);
661 }
662 if (Heritrix.guiPort <= 0) {
663 clp.usage("Nonsensical port number: " +
664 options[i].getValue(), 1);
665 }
666 break;
667
668 case 'r':
669 runMode = true;
670 break;
671
672 case 's':
673 selfTestName = options[i].getValue();
674 selfTest = true;
675 break;
676
677 default:
678 assert false: options[i].getId();
679 }
680 }
681
682
683 String status = null;
684 if (selfTest) {
685
686
687
688 for (int i = 0; i < options.length; i++) {
689 if (options[i].getId() != 'p' && options[i].getId() != 's') {
690 clp.usage(1);
691 }
692 }
693
694 if (arguments.size() > 0) {
695
696 clp.usage(1);
697 }
698 status = selftest(selfTestName, Heritrix.guiPort);
699 } else {
700 if (!Heritrix.gui) {
701 if (options.length > 1) {
702
703
704
705 clp.usage(1);
706 }
707 Heritrix h = new Heritrix(true);
708 status = h.doOneCrawl(crawlOrderFile);
709 } else {
710 if (!isValidLoginPasswordString(adminLoginPassword)) {
711
712 clp.usage("Invalid admin login:password value, or none "
713 + "specified. ", 1);
714 }
715 status = startEmbeddedWebserver(
716 Heritrix.guiHosts, Heritrix.guiPort,
717 adminLoginPassword);
718 Heritrix h = new Heritrix(true);
719
720 String tmp = h.launch(crawlOrderFile, runMode);
721 if (tmp != null) {
722 status += ('\n' + tmp);
723 }
724 }
725 }
726 return status;
727 }
728
729 /***
730 * @return The file we dump stdout and stderr into.
731 */
732 public static String getHeritrixOut() {
733 String tmp = System.getProperty("heritrix.out");
734 if (tmp == null || tmp.length() == 0) {
735 tmp = Heritrix.DEFAULT_HERITRIX_OUT;
736 }
737 return tmp;
738 }
739
740 /***
741 * Exploit <code>-Dheritrix.home</code> if available to us.
742 * Is current working dir if no heritrix.home property supplied.
743 * @return Heritrix home directory.
744 * @throws IOException
745 */
746 protected static File getHeritrixHome()
747 throws IOException {
748 File heritrixHome = null;
749 String home = System.getProperty("heritrix.home");
750 if (home != null && home.length() > 0) {
751 heritrixHome = new File(home);
752 if (!heritrixHome.exists()) {
753 throw new IOException("HERITRIX_HOME <" + home +
754 "> does not exist.");
755 }
756 } else {
757 heritrixHome = new File(new File("").getAbsolutePath());
758 }
759 return heritrixHome;
760 }
761
762 /***
763 * @return The directory into which we put jobs. If the system property
764 * 'heritrix.jobsdir' is set, we will use its value in place of the default
765 * 'jobs' directory in the current working directory.
766 * @throws IOException
767 */
768 public static File getJobsdir() throws IOException {
769 Heritrix.loadProperties();
770 String jobsdirStr = System.getProperty("heritrix.jobsdir", "jobs");
771 File jobsdir = new File(jobsdirStr);
772 return (jobsdir.isAbsolute())?
773 jobsdir:
774 new File(getHeritrixHome(), jobsdirStr);
775 }
776
777 /***
778 * Get and check for existence of expected subdir.
779 *
780 * If development flag set, then look for dir under src dir.
781 *
782 * @param subdirName Dir to look for.
783 * @return The extant subdir. Otherwise null if we're running
784 * in a webapp context where there is no conf directory available.
785 * @throws IOException if unable to find expected subdir.
786 */
787 protected static File getSubDir(String subdirName)
788 throws IOException {
789 return getSubDir(subdirName, true);
790 }
791
792 /***
793 * Get and optionally check for existence of subdir.
794 *
795 * If development flag set, then look for dir under src dir.
796 *
797 * @param subdirName Dir to look for.
798 * @param fail True if we are to fail if directory does not
799 * exist; false if we are to return false if the directory does not exist.
800 * @return The extant subdir. Otherwise null if we're running
801 * in a webapp context where there is no subdir directory available.
802 * @throws IOException if unable to find expected subdir.
803 */
804 protected static File getSubDir(String subdirName, boolean fail)
805 throws IOException {
806 String path = isDevelopment()?
807 "src" + File.separator + subdirName:
808 subdirName;
809 File dir = new File(getHeritrixHome(), path);
810 if (!dir.exists()) {
811 if (fail) {
812 throw new IOException("Cannot find subdir: " + subdirName);
813 }
814 dir = null;
815 }
816 return dir;
817 }
818
819 /***
820 * Test string is valid login/password string.
821 *
822 * A valid login/password string has the login and password compounded
823 * w/ a ':' delimiter.
824 *
825 * @param str String to test.
826 * @return True if valid password/login string.
827 */
828 protected static boolean isValidLoginPasswordString(String str) {
829 boolean isValid = false;
830 StringTokenizer tokenizer = new StringTokenizer(str, ":");
831 if (tokenizer.countTokens() == 2) {
832 String login = ((String)tokenizer.nextElement()).trim();
833 String password = ((String)tokenizer.nextElement()).trim();
834 if (login.length() > 0 && password.length() > 0) {
835 isValid = true;
836 }
837 }
838 return isValid;
839 }
840
841 protected static boolean isDevelopment() {
842 return System.getProperty("heritrix.development") != null;
843 }
844
845 /***
846 * Load the heritrix.properties file.
847 *
848 * Adds any property that starts with
849 * <code>HERITRIX_PROPERTIES_PREFIX</code>
850 * or <code>ARCHIVE_PACKAGE</code>
851 * into system properties (except logging '.level' directives).
852 * @return Loaded properties.
853 * @throws IOException
854 */
855 protected static Properties loadProperties()
856 throws IOException {
857 if (Heritrix.propertiesLoaded) {
858 return System.getProperties();
859 }
860 Heritrix.propertiesLoaded = true;
861
862 Properties properties = new Properties();
863 properties.load(getPropertiesInputStream());
864
865
866
867
868
869 for (Enumeration e = properties.keys(); e.hasMoreElements();) {
870 String key = ((String)e.nextElement()).trim();
871 if (key.startsWith(ARCHIVE_PACKAGE) ||
872 key.startsWith(HERITRIX_PROPERTIES_PREFIX)) {
873
874
875 String value = properties.getProperty(key).trim();
876 if (key.indexOf(".level") < 0) {
877 copyToSystemProperty(key, value);
878 }
879 } else if (key.startsWith(SYSTEM_PREFIX)) {
880 String value = properties.getProperty(key).trim();
881 copyToSystemProperty(key.substring(SYSTEM_PREFIX.length()), value);
882 }
883 }
884 return properties;
885 }
886
887 /***
888 * Copy the given key-value into System properties, as long as there
889 * is no existing value.
890 * @param key property key
891 * @param value property value
892 */
893 protected static void copyToSystemProperty(String key, String value) {
894 if (System.getProperty(key) == null ||
895 System.getProperty(key).length() == 0) {
896 System.setProperty(key, value);
897 }
898 }
899
900 protected static InputStream getPropertiesInputStream()
901 throws IOException {
902 File file = null;
903
904 String alternateProperties = System.getProperty(PROPERTIES_KEY);
905 if (alternateProperties != null && alternateProperties.length() > 0) {
906 file = new File(alternateProperties);
907 }
908
909 if ((file == null || !file.exists()) && getConfdir(false) != null) {
910 file = new File(getConfdir(), PROPERTIES);
911 if (!file.exists()) {
912
913
914 file = null;
915 }
916 }
917
918
919
920 InputStream is = (file != null)?
921 new FileInputStream(file):
922 Heritrix.class.getResourceAsStream("/" + PROPERTIES_KEY);
923 if (is == null) {
924 throw new IOException("Failed to load properties file from" +
925 " filesystem or from classpath.");
926 }
927 return is;
928 }
929
930 /***
931 * If the user hasn't altered the default logging parameters, tighten them
932 * up somewhat: some of our libraries are way too verbose at the INFO or
933 * WARNING levels.
934 *
935 * This might be a problem running inside in someone else's
936 * container. Container's seem to prefer commons logging so we
937 * ain't messing them doing the below.
938 *
939 * @throws IOException
940 * @throws SecurityException
941 */
942 protected static void patchLogging()
943 throws SecurityException, IOException {
944 if (System.getProperty("java.util.logging.config.class") != null) {
945 return;
946 }
947
948 if (System.getProperty("java.util.logging.config.file") != null) {
949 return;
950 }
951
952
953
954 LogManager.getLogManager().
955 readConfiguration(getPropertiesInputStream());
956 }
957
958 /***
959 * Configure our trust store.
960 *
961 * If system property is defined, then use it for our truststore. Otherwise
962 * use the heritrix truststore under conf directory if it exists.
963 *
964 * <p>If we're not launched from the command-line, we will not be able
965 * to find our truststore. The truststore is nor normally used so rare
966 * should this be a problem (In case where we don't use find our trust
967 * store, we'll use the 'default' -- either the JVMs or the containers).
968 */
969 protected static void configureTrustStore() {
970
971 final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore";
972 String value = System.getProperty(TRUSTSTORE_KEY);
973 File confdir = null;
974 try {
975 confdir = getConfdir(false);
976 } catch (IOException e) {
977 logger.log(Level.WARNING, "Failed to get confdir.", e);
978 }
979 if ((value == null || value.length() <= 0) && confdir != null) {
980
981 File heritrixStore = new File(confdir, "heritrix.cacerts");
982 if(heritrixStore.exists()) {
983 value = heritrixStore.getAbsolutePath();
984 }
985 }
986
987 if (value != null && value.length() > 0) {
988 System.setProperty(TRUSTSTORE_KEY, value);
989 }
990 }
991
992 /***
993 * Run the selftest
994 *
995 * @param oneSelfTestName Name of a test if we are to run one only rather
996 * than the default running all tests.
997 * @param port Port number to use for web UI.
998 *
999 * @exception Exception
1000 * @return Status of how selftest startup went.
1001 */
1002 protected static String selftest(final String oneSelfTestName,
1003 final int port)
1004 throws Exception {
1005
1006 final String SELFTEST = "selftest";
1007 Heritrix.httpServer = new SimpleHttpServer(SELFTEST,
1008 Heritrix.adminContext, LOCALHOST_ONLY, port, true);
1009
1010
1011
1012
1013
1014 Heritrix.httpServer.setAuthentication(SELFTEST, Heritrix.adminContext,
1015 SELFTEST, SELFTEST, SELFTEST);
1016 Heritrix.httpServer.startServer();
1017
1018
1019 File selftestDir = (isDevelopment())?
1020 new File(getConfdir(), SELFTEST):
1021 new File(File.separator + SELFTEST);
1022 File crawlOrderFile = new File(selftestDir, "order.xml");
1023
1024
1025
1026
1027 final String ROOTURI = "127.0.0.1:" + Integer.toString(port);
1028 String selfTestUrl = "http://" + ROOTURI + '/';
1029 if (oneSelfTestName != null && oneSelfTestName.length() > 0) {
1030 selfTestUrl += (oneSelfTestName + '/');
1031 }
1032 CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),
1033 oneSelfTestName, selfTestUrl);
1034 Heritrix h = new Heritrix("Selftest", true, cjh);
1035 CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");
1036 job = h.getJobHandler().newJob(job, null, SELFTEST,
1037 "Integration self test", selfTestUrl, CrawlJob.PRIORITY_AVERAGE);
1038 h.getJobHandler().addJob(job);
1039
1040 CredentialStore cs = (CredentialStore)job.getSettingsHandler().
1041 getOrder().getAttribute(CredentialStore.ATTR_NAME);
1042 for (Iterator i = cs.iterator(null); i.hasNext();) {
1043 ((Credential)i.next()).setCredentialDomain(null, ROOTURI);
1044 }
1045 h.getJobHandler().startCrawler();
1046 StringBuffer buffer = new StringBuffer();
1047 buffer.append("Heritrix " + Heritrix.getVersion() +
1048 " selftest started.");
1049 buffer.append("\nSelftest first crawls " + selfTestUrl +
1050 " and then runs an analysis.");
1051 buffer.append("\nResult of analysis printed to " +
1052 getHeritrixOut() + " when done.");
1053 buffer.append("\nSelftest job directory for logs and arcs:\n" +
1054 job.getDirectory().getAbsolutePath());
1055 return buffer.toString();
1056 }
1057
1058 /***
1059 * Launch the crawler without a web UI and run the passed crawl only.
1060 *
1061 * Specialized version of {@link #launch()}.
1062 *
1063 * @param crawlOrderFile The crawl order to crawl.
1064 * @throws InitializationException
1065 * @throws InvalidAttributeValueException
1066 * @return Status string.
1067 */
1068 protected String doOneCrawl(String crawlOrderFile)
1069 throws InitializationException, InvalidAttributeValueException {
1070 return doOneCrawl(crawlOrderFile, null);
1071 }
1072
1073 /***
1074 * Launch the crawler without a web UI and run passed crawl only.
1075 *
1076 * Specialized version of {@link #launch()}.
1077 *
1078 * @param crawlOrderFile The crawl order to crawl.
1079 * @param listener Register this crawl status listener before starting
1080 * crawl (You can use this listener to notice end-of-crawl).
1081 * @throws InitializationException
1082 * @throws InvalidAttributeValueException
1083 * @return Status string.
1084 */
1085 protected String doOneCrawl(String crawlOrderFile,
1086 CrawlStatusListener listener)
1087 throws InitializationException, InvalidAttributeValueException {
1088 XMLSettingsHandler handler =
1089 new XMLSettingsHandler(new File(crawlOrderFile));
1090 handler.initialize();
1091 CrawlController controller = new CrawlController();
1092 controller.initialize(handler);
1093 if (listener != null) {
1094 controller.addCrawlStatusListener(listener);
1095 }
1096 controller.requestCrawlStart();
1097 return "Crawl started using " + crawlOrderFile + ".";
1098 }
1099
1100 /***
1101 * Launch the crawler for a web UI.
1102 *
1103 * Crawler hangs around waiting on jobs.
1104 *
1105 * @exception Exception
1106 * @return A status string describing how the launch went.
1107 * @throws Exception
1108 */
1109 public String launch() throws Exception {
1110 return launch(null, false);
1111 }
1112
1113 /***
1114 * Launch the crawler for a web UI.
1115 *
1116 * Crawler hangs around waiting on jobs.
1117 *
1118 * @param crawlOrderFile File to crawl. May be null.
1119 * @param runMode Whether crawler should be set to run mode.
1120 *
1121 * @exception Exception
1122 * @return A status string describing how the launch went.
1123 */
1124 public String launch(String crawlOrderFile, boolean runMode)
1125 throws Exception {
1126 String status = null;
1127 if (crawlOrderFile != null) {
1128 addCrawlJob(crawlOrderFile, "Autolaunched", "", "");
1129 if(runMode) {
1130 this.jobHandler.startCrawler();
1131 status = "Job being crawled: " + crawlOrderFile;
1132 } else {
1133 status = "Crawl job ready and pending: " + crawlOrderFile;
1134 }
1135 } else if(runMode) {
1136
1137
1138
1139 this.jobHandler.startCrawler();
1140 status = "Crawler set to run mode.";
1141 }
1142 return status;
1143 }
1144
1145 /***
1146 * Start up the embedded Jetty webserver instance.
1147 * This is done when we're run from the command-line.
1148 * @param port Port number to use for web UI.
1149 * @param adminLoginPassword Compound of login and password.
1150 * @throws Exception
1151 * @return Status on webserver startup.
1152 * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword)
1153 */
1154 protected static String startEmbeddedWebserver(final int port,
1155 final boolean lho, final String adminLoginPassword)
1156 throws Exception {
1157 ArrayList<String> hosts = new ArrayList<String>();
1158 if (lho) {
1159 hosts.add("127.0.0.1");
1160 }
1161 return startEmbeddedWebserver(hosts, port, adminLoginPassword);
1162 }
1163
1164
1165 /***
1166 * Parses a list of host names.
1167 *
1168 * <p>If the given string is <code>/</code>, then an empty
1169 * collection is returned. This indicates that all available network
1170 * interfaces should be used.
1171 *
1172 * <p>Otherwise, the string must contain a comma-separated list of
1173 * IP addresses or host names. The parsed list is then returned.
1174 *
1175 * @param hosts the string to parse
1176 * @return the parsed collection of hosts
1177 */
1178 private static Collection<String> parseHosts(String hosts) {
1179 hosts = hosts.trim();
1180 if (hosts.equals("/")) {
1181 return new ArrayList<String>(1);
1182 }
1183 String[] hostArray = hosts.split(",");
1184 for (int i = 0; i < hostArray.length; i++) {
1185 hostArray[i] = hostArray[i].trim();
1186 }
1187 return Arrays.asList(hostArray);
1188 }
1189
1190 /***
1191 * Start up the embedded Jetty webserver instance.
1192 * This is done when we're run from the command-line.
1193 *
1194 * @param hosts a list of IP addresses or hostnames to bind to, or an
1195 * empty collection to bind to all available network
1196 * interfaces
1197 * @param port Port number to use for web UI.
1198 * @param adminLoginPassword Compound of login and password.
1199 * @throws Exception
1200 * @return Status on webserver startup.
1201 */
1202 protected static String startEmbeddedWebserver(Collection<String> hosts,
1203 int port, String adminLoginPassword)
1204 throws Exception {
1205 adminUsername = adminLoginPassword.
1206 substring(0, adminLoginPassword.indexOf(":"));
1207 adminPassword = adminLoginPassword.
1208 substring(adminLoginPassword.indexOf(":") + 1);
1209 Heritrix.httpServer = new SimpleHttpServer("admin",
1210 Heritrix.adminContext, hosts, port, false);
1211
1212 final String DOTWAR = ".war";
1213 final String SELFTEST = "selftest";
1214
1215
1216 File[] wars = getWarsdir().listFiles();
1217 for(int i = 0; i < wars.length; i++) {
1218 if(wars[i].isFile()) {
1219 final String warName = wars[i].getName();
1220 final String warNameNC = warName.toLowerCase();
1221 if(warNameNC.endsWith(DOTWAR) &&
1222 !warNameNC.equals(ADMIN + DOTWAR) &&
1223 !warNameNC.equals(SELFTEST + DOTWAR)) {
1224 int dot = warName.indexOf('.');
1225 Heritrix.httpServer.addWebapp(warName.substring(0, dot),
1226 null, true);
1227 }
1228 }
1229 }
1230
1231
1232
1233 final String ROLE = ADMIN;
1234 Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext,
1235 adminUsername, adminPassword, ROLE);
1236 Heritrix.httpServer.startServer();
1237 StringBuffer buffer = new StringBuffer();
1238 buffer.append("Heritrix " + Heritrix.getVersion() + " is running.");
1239 for (String host: httpServer.getHosts()) {
1240 buffer.append("\nWeb console is at: http://");
1241 buffer.append(host).append(':').append(port);
1242 }
1243 buffer.append("\nWeb console login and password: " +
1244 adminUsername + "/" + adminPassword);
1245 return buffer.toString();
1246 }
1247
1248 /***
1249 * Replace existing administrator login info with new info.
1250 *
1251 * @param newUsername new administrator login username
1252 * @param newPassword new administrator login password
1253 */
1254 public static void resetAuthentication(String newUsername,
1255 String newPassword) {
1256 Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,
1257 newUsername, newPassword);
1258 adminUsername = newUsername;
1259 adminPassword = newPassword;
1260 logger.info("administrative login changed to "
1261 +newUsername+":"+newPassword);
1262 }
1263
1264 protected static CrawlJob createCrawlJob(CrawlJobHandler handler,
1265 File crawlOrderFile, String name)
1266 throws InvalidAttributeValueException {
1267 XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile);
1268 settings.initialize();
1269 return new CrawlJob(handler.getNextJobUID(), name, settings,
1270 new CrawlJobErrorHandler(Level.SEVERE),
1271 CrawlJob.PRIORITY_HIGH,
1272 crawlOrderFile.getAbsoluteFile().getParentFile());
1273 }
1274
1275 /***
1276 * This method is called when we have an order file to hand that we want
1277 * to base a job on. It leaves the order file in place and just starts up
1278 * a job that uses all the order points to for locations for logs, etc.
1279 * @param orderPathOrUrl Path to an order file or to a seeds file.
1280 * @param name Name to use for this job.
1281 * @param description
1282 * @param seeds
1283 * @return A status string.
1284 * @throws IOException
1285 * @throws FatalConfigurationException
1286 */
1287 public String addCrawlJob(String orderPathOrUrl, String name,
1288 String description, String seeds)
1289 throws IOException, FatalConfigurationException {
1290 if (!UURI.hasScheme(orderPathOrUrl)) {
1291
1292 return addCrawlJob(new File(orderPathOrUrl), name, description,
1293 seeds);
1294 }
1295
1296
1297 URL url = new URL(orderPathOrUrl);
1298
1299
1300
1301
1302 String result = null;
1303 URLConnection connection = url.openConnection();
1304 if (connection instanceof HttpURLConnection) {
1305 result = addCrawlJob(url, (HttpURLConnection)connection, name,
1306 description, seeds);
1307 } else if (connection instanceof FileURLConnection) {
1308 result = addCrawlJob(new File(url.getPath()), name, description,
1309 seeds);
1310 } else {
1311 throw new UnsupportedOperationException("No support for "
1312 + connection);
1313 }
1314
1315 return result;
1316 }
1317
1318 protected String addCrawlJob(final URL url,
1319 final HttpURLConnection connection,
1320 final String name, final String description, final String seeds)
1321 throws IOException, FatalConfigurationException {
1322
1323 boolean isJar = url.getPath() != null &&
1324 url.getPath().toLowerCase().endsWith(JAR_SUFFIX);
1325
1326 File localFile = File.createTempFile(Heritrix.class.getName(),
1327 isJar? JAR_SUFFIX: null, TMPDIR);
1328 connection.connect();
1329 String result = null;
1330 try {
1331 IoUtils.readFullyToFile(connection.getInputStream(), localFile);
1332 result = addCrawlJob(localFile, name, description, seeds);
1333 } catch (IOException ioe) {
1334
1335 localFile.delete();
1336 localFile = null;
1337 } finally {
1338 connection.disconnect();
1339
1340
1341
1342 if (isJar && localFile != null && localFile.exists()) {
1343 localFile.delete();
1344 }
1345 }
1346 return result;
1347 }
1348
1349 protected String addCrawlJob(final File order, final String name,
1350 final String description, final String seeds)
1351 throws FatalConfigurationException, IOException {
1352 CrawlJob addedJob = null;
1353 if (this.jobHandler == null) {
1354 throw new NullPointerException("Heritrix jobhandler is null.");
1355 }
1356 try {
1357 if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) {
1358 return addCrawlJobBasedonJar(order, name, description, seeds);
1359 }
1360 addedJob = this.jobHandler.
1361 addJob(createCrawlJob(this.jobHandler, order, name));
1362 } catch (InvalidAttributeValueException e) {
1363 FatalConfigurationException fce = new FatalConfigurationException(
1364 "Converted InvalidAttributeValueException on " +
1365 order.getAbsolutePath() + ": " + e.getMessage());
1366 fce.setStackTrace(e.getStackTrace());
1367 }
1368 return addedJob != null? addedJob.getUID(): null;
1369 }
1370
1371 /***
1372 * Undo jar file and use as basis for a new job.
1373 * @param jarFile Pointer to file that holds jar.
1374 * @param name Name to use for new job.
1375 * @param description
1376 * @param seeds
1377 * @return Message.
1378 * @throws IOException
1379 * @throws FatalConfigurationException
1380 */
1381 protected String addCrawlJobBasedonJar(final File jarFile,
1382 final String name, final String description, final String seeds)
1383 throws IOException, FatalConfigurationException {
1384 if (jarFile == null || !jarFile.exists()) {
1385 throw new FileNotFoundException(jarFile.getAbsolutePath());
1386 }
1387
1388
1389
1390
1391 File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar",
1392 TMPDIR);
1393 dir.delete();
1394 dir.mkdir();
1395 try {
1396 org.archive.crawler.util.IoUtils.unzip(jarFile, dir);
1397
1398 File orderFile = new File(dir, "order.xml");
1399 if (!orderFile.exists()) {
1400 throw new IOException("Missing order: " +
1401 orderFile.getAbsolutePath());
1402 }
1403 CrawlJob job =
1404 createCrawlJobBasedOn(orderFile, name, description, seeds);
1405
1406
1407 File seedsFile = new File(dir, "seeds.txt");
1408 if (seedsFile.exists()) {
1409 FileUtils.copyFiles(seedsFile, new File(job.getDirectory(),
1410 seedsFile.getName()));
1411 }
1412 addCrawlJob(job);
1413 return job.getUID();
1414 } finally {
1415
1416
1417
1418
1419
1420 org.archive.util.FileUtils.deleteDir(dir);
1421 }
1422 }
1423
1424 public String addCrawlJobBasedOn(String jobUidOrProfile,
1425 String name, String description, String seeds) {
1426 try {
1427 CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);
1428 if (cj == null) {
1429 throw new InvalidAttributeValueException(jobUidOrProfile +
1430 " is not a job UID or profile name (Job UIDs are " +
1431 " usually the 14 digit date portion of job name).");
1432 }
1433 CrawlJob job = addCrawlJobBasedOn(
1434 cj.getSettingsHandler().getOrderFile(), name, description,
1435 seeds);
1436 return job.getUID();
1437 } catch (Exception e) {
1438 e.printStackTrace();
1439 return "Exception on " + jobUidOrProfile + ": " + e.getMessage();
1440 }
1441 }
1442
1443 protected CrawlJob addCrawlJobBasedOn(final File orderFile,
1444 final String name, final String description, final String seeds)
1445 throws FatalConfigurationException {
1446 return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description,
1447 seeds));
1448 }
1449
1450 protected CrawlJob createCrawlJobBasedOn(final File orderFile,
1451 final String name, final String description, final String seeds)
1452 throws FatalConfigurationException {
1453 CrawlJob job = getJobHandler().newJob(orderFile, name, description,
1454 seeds);
1455 return CrawlJobHandler.ensureNewJobWritten(job, name, description);
1456 }
1457
1458 protected CrawlJob addCrawlJob(final CrawlJob job) {
1459 return getJobHandler().addJob(job);
1460 }
1461
1462 public void startCrawling() {
1463 if (getJobHandler() == null) {
1464 throw new NullPointerException("Heritrix jobhandler is null.");
1465 }
1466 getJobHandler().startCrawler();
1467 }
1468
1469 public void stopCrawling() {
1470 if (getJobHandler() == null) {
1471 throw new NullPointerException("Heritrix jobhandler is null.");
1472 }
1473 getJobHandler().stopCrawler();
1474 }
1475
1476 /***
1477 * Get the heritrix version.
1478 *
1479 * @return The heritrix version. May be null.
1480 */
1481 public static String getVersion() {
1482 return System.getProperty("heritrix.version");
1483 }
1484
1485 /***
1486 * Get the job handler
1487 *
1488 * @return The CrawlJobHandler being used.
1489 */
1490 public CrawlJobHandler getJobHandler() {
1491 return this.jobHandler;
1492 }
1493
1494 /***
1495 * Get the configuration directory.
1496 * @return The conf directory under HERITRIX_HOME or null if none can
1497 * be found.
1498 * @throws IOException
1499 */
1500 public static File getConfdir()
1501 throws IOException {
1502 return getConfdir(true);
1503 }
1504
1505 /***
1506 * Get the configuration directory.
1507 * @param fail Throw IOE if can't find directory if true, else just
1508 * return null.
1509 * @return The conf directory under HERITRIX_HOME or null (or an IOE) if
1510 * can't be found.
1511 * @throws IOException
1512 */
1513 public static File getConfdir(final boolean fail)
1514 throws IOException {
1515 final String key = "heritrix.conf";
1516
1517 String tmp = System.getProperty(key);
1518
1519 if (tmp == null || tmp.length() == 0) {
1520 return getSubDir("conf", fail);
1521 }
1522 File dir = new File(tmp);
1523 if (!dir.exists()) {
1524 if (fail) {
1525 throw new IOException("Cannot find conf dir: " + tmp);
1526 } else {
1527 logger.log(Level.WARNING, "Specified " + key +
1528 " dir does not exist. Falling back on default");
1529 }
1530 dir = getSubDir("conf", fail);
1531 }
1532 return dir;
1533 }
1534
1535 /***
1536 * @return Returns the httpServer. May be null if one was not started.
1537 */
1538 public static SimpleHttpServer getHttpServer() {
1539 return Heritrix.httpServer;
1540 }
1541
1542 /***
1543 * @throws IOException
1544 * @return Returns the directory under which reside the WAR files
1545 * we're to load into the servlet container.
1546 */
1547 public static File getWarsdir()
1548 throws IOException {
1549 return getSubDir("webapps");
1550 }
1551
1552 /***
1553 * Prepars for program shutdown. This method does it's best to prepare the
1554 * program so that it can exit normally. It will kill the httpServer and
1555 * terminate any running job.<br>
1556 * It is advisible to wait a few (~1000) millisec after calling this method
1557 * and before calling performHeritrixShutDown() to allow as many threads as
1558 * possible to finish what they are doing.
1559 */
1560 public static void prepareHeritrixShutDown() {
1561
1562
1563
1564 final Object [] keys = Heritrix.instances.keySet().toArray();
1565 for (int i = 0; i < keys.length; i++) {
1566 ((Heritrix)Heritrix.instances.get(keys[i])).destroy();
1567 }
1568
1569 try {
1570 deregisterJndi(getJndiContainerName());
1571 } catch (NameNotFoundException e) {
1572
1573 logger.log(Level.WARNING, "deregistration of jndi", e);
1574 } catch (Exception e) {
1575 e.printStackTrace();
1576 }
1577
1578 if(Heritrix.httpServer != null) {
1579
1580 try {
1581 Heritrix.httpServer.stopServer();
1582 } catch (InterruptedException e) {
1583
1584
1585 e.printStackTrace();
1586 } finally {
1587 Heritrix.httpServer = null;
1588 }
1589 }
1590 }
1591
1592 /***
1593 * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1594 * prior to this method.
1595 */
1596 public static void performHeritrixShutDown() {
1597 performHeritrixShutDown(0);
1598 }
1599
1600 /***
1601 * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1602 * prior to this method.
1603 *
1604 * @param exitCode Code to pass System.exit.
1605 *
1606 */
1607 public static void performHeritrixShutDown(int exitCode) {
1608 System.exit(exitCode);
1609 }
1610
1611 /***
1612 * Shutdown all running heritrix instances and the JVM.
1613 * Assumes stop has already been called.
1614 * @param exitCode Exit code to pass system exit.
1615 */
1616 public static void shutdown(final int exitCode) {
1617 getShutdownThread(true, exitCode, "Heritrix shutdown").start();
1618 }
1619
1620 protected static Thread getShutdownThread(final boolean sysexit,
1621 final int exitCode, final String name) {
1622 Thread t = new Thread(name) {
1623 public void run() {
1624 Heritrix.prepareHeritrixShutDown();
1625 if (sysexit) {
1626 Heritrix.performHeritrixShutDown(exitCode);
1627 }
1628 }
1629 };
1630 t.setDaemon(true);
1631 return t;
1632 }
1633
1634 public static void shutdown() {
1635 shutdown(0);
1636 }
1637
1638 /***
1639 * Register Heritrix with JNDI, JMX, and with the static hashtable of all
1640 * Heritrix instances known to this JVM.
1641 *
1642 * If launched from cmdline, register Heritrix MBean if an agent to register
1643 * ourselves with. Usually this method will only have effect if we're
1644 * running in a 1.5.0 JDK and command line options such as
1645 * '-Dcom.sun.management.jmxremote.port=8082
1646 * -Dcom.sun.management.jmxremote.authenticate=false
1647 * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
1648 * See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring
1649 * and Management Using JMX</a>
1650 * for more on the command line options and how to connect to the
1651 * Heritrix bean using the JDK 1.5.0 jconsole tool. We register currently
1652 * with first server we find (TODO: Make configurable).
1653 *
1654 * <p>If we register successfully with a JMX agent, then part of the
1655 * registration will include our registering ourselves with JNDI.
1656 *
1657 * <p>Finally, add the heritrix instance to the hashtable of all the
1658 * Heritrix instances floating in the current VM. This latter registeration
1659 * happens whether or no there is a JMX agent to register with. This is
1660 * a list we keep out of convenience so its easy iterating over all
1661 * all instances calling stop when main application is going down.
1662 *
1663 * @param h Instance of heritrix to register.
1664 * @param name Name to use for this Heritrix instance.
1665 * @param jmxregister True if we are to register this instance with JMX.
1666 * @throws NullPointerException
1667 * @throws MalformedObjectNameException
1668 * @throws NotCompliantMBeanException
1669 * @throws MBeanRegistrationException
1670 * @throws InstanceAlreadyExistsException
1671 */
1672 protected static void registerHeritrix(final Heritrix h,
1673 final String name, final boolean jmxregister)
1674 throws MalformedObjectNameException, InstanceAlreadyExistsException,
1675 MBeanRegistrationException, NotCompliantMBeanException {
1676 MBeanServer server = getMBeanServer();
1677 if (server != null) {
1678
1679
1680
1681 if (jmxregister) {
1682 ObjectName objName = (name == null || name.length() <= 0)?
1683 getJmxObjectName(): getJmxObjectName(name);
1684 registerMBean(server, h, objName);
1685 }
1686 } else {
1687
1688
1689
1690
1691 Heritrix.instances.put(h.getNoJmxName(), h);
1692 }
1693 }
1694
1695 protected static void unregisterHeritrix(final Heritrix h)
1696 throws InstanceNotFoundException, MBeanRegistrationException,
1697 NullPointerException {
1698 MBeanServer server = getMBeanServer();
1699 if (server != null) {
1700 server.unregisterMBean(h.mbeanName);
1701 } else {
1702
1703
1704 Heritrix.instances.remove(h.getNoJmxName());
1705 }
1706 }
1707
1708 /***
1709 * Get MBeanServer.
1710 * Currently uses first MBeanServer found. This will definetly not be whats
1711 * always wanted. TODO: Make which server settable. Also, if none, put up
1712 * our own MBeanServer.
1713 * @return An MBeanServer to register with or null.
1714 */
1715 public static MBeanServer getMBeanServer() {
1716 MBeanServer result = null;
1717 List servers = MBeanServerFactory.findMBeanServer(null);
1718 if (servers == null) {
1719 return result;
1720 }
1721 for (Iterator i = servers.iterator(); i.hasNext();) {
1722 MBeanServer server = (MBeanServer)i.next();
1723 if (server == null) {
1724 continue;
1725 }
1726 result = server;
1727 break;
1728 }
1729 return result;
1730 }
1731
1732 public static MBeanServer registerMBean(final Object objToRegister,
1733 final String name, final String type)
1734 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1735 NotCompliantMBeanException {
1736 MBeanServer server = getMBeanServer();
1737 if (server != null) {
1738 server = registerMBean(server, objToRegister, name, type);
1739 }
1740 return server;
1741 }
1742
1743 public static MBeanServer registerMBean(final MBeanServer server,
1744 final Object objToRegister, final String name, final String type)
1745 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1746 NotCompliantMBeanException {
1747 try {
1748 Hashtable<String,String> ht = new Hashtable<String,String>();
1749 ht.put(JmxUtils.NAME, name);
1750 ht.put(JmxUtils.TYPE, type);
1751 registerMBean(server, objToRegister,
1752 new ObjectName(CRAWLER_PACKAGE, ht));
1753 } catch (MalformedObjectNameException e) {
1754 e.printStackTrace();
1755 }
1756 return server;
1757 }
1758
1759 public static MBeanServer registerMBean(final MBeanServer server,
1760 final Object objToRegister, final ObjectName objName)
1761 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1762 NotCompliantMBeanException {
1763 server.registerMBean(objToRegister, objName);
1764 return server;
1765 }
1766
1767 public static void unregisterMBean(final MBeanServer server,
1768 final String name, final String type) {
1769 if (server == null) {
1770 return;
1771 }
1772 try {
1773 unregisterMBean(server, getJmxObjectName(name, type));
1774 } catch (MalformedObjectNameException e) {
1775 e.printStackTrace();
1776 }
1777 }
1778
1779 public static void unregisterMBean(final MBeanServer server,
1780 final ObjectName name) {
1781 try {
1782 server.unregisterMBean(name);
1783 logger.info("Unregistered bean " + name.getCanonicalName());
1784 } catch (InstanceNotFoundException e) {
1785 e.printStackTrace();
1786 } catch (MBeanRegistrationException e) {
1787 e.printStackTrace();
1788 } catch (NullPointerException e) {
1789 e.printStackTrace();
1790 }
1791 }
1792
1793 /***
1794 * @return Name to use when no JMX agent available.
1795 */
1796 protected String getNoJmxName() {
1797 return this.getClass().getName();
1798 }
1799
1800 public static ObjectName getJmxObjectName()
1801 throws MalformedObjectNameException, NullPointerException {
1802 return getJmxObjectName("Heritrix", JmxUtils.SERVICE);
1803 }
1804
1805 public static ObjectName getJmxObjectName(final String name)
1806 throws MalformedObjectNameException, NullPointerException {
1807 return getJmxObjectName(name, JmxUtils.SERVICE);
1808 }
1809
1810 public static ObjectName getJmxObjectName(final String name,
1811 final String type)
1812 throws MalformedObjectNameException, NullPointerException {
1813 Hashtable<String,String> ht = new Hashtable<String,String>();
1814 ht.put(JmxUtils.NAME, name);
1815 ht.put(JmxUtils.TYPE, type);
1816 return new ObjectName(CRAWLER_PACKAGE, ht);
1817 }
1818
1819 /***
1820 * @return Returns true if Heritrix was launched from the command line.
1821 * (When launched from command line, we do stuff like put up a web server
1822 * to manage our web interface and we register ourselves with the first
1823 * available jmx agent).
1824 */
1825 public static boolean isCommandLine() {
1826 return Heritrix.commandLine;
1827 }
1828
1829 /***
1830 * @return True if heritrix has been started.
1831 */
1832 public boolean isStarted() {
1833 return this.jobHandler != null;
1834 }
1835
1836 public String getStatus() {
1837 StringBuffer buffer = new StringBuffer();
1838 if (this.getJobHandler() != null) {
1839 buffer.append("isRunning=");
1840 buffer.append(this.getJobHandler().isRunning());
1841 buffer.append(" isCrawling=");
1842 buffer.append(this.getJobHandler().isCrawling());
1843 buffer.append(" alertCount=");
1844 buffer.append(getAlertsCount());
1845 buffer.append(" newAlertCount=");
1846 buffer.append(getNewAlertsCount());
1847 if (this.getJobHandler().isCrawling()) {
1848 buffer.append(" currentJob=");
1849 buffer.append(this.getJobHandler().getCurrentJob().
1850 getJmxJobName());
1851 }
1852 }
1853 return buffer.toString();
1854 }
1855
1856
1857 public int getAlertsCount() {
1858 return this.alertManager.getCount();
1859 }
1860
1861 public int getNewAlertsCount() {
1862 return this.alertManager.getNewCount();
1863 }
1864
1865 public Vector getAlerts() {
1866 return this.alertManager.getAll();
1867 }
1868
1869 public Vector getNewAlerts() {
1870 return this.alertManager.getNewAll();
1871 }
1872
1873 public SinkHandlerLogRecord getAlert(final String id) {
1874 return this.alertManager.get(id);
1875 }
1876
1877 public void readAlert(final String id) {
1878 this.alertManager.read(id);
1879 }
1880
1881 public void removeAlert(final String id) {
1882 this.alertManager.remove(id);
1883 }
1884
1885 /***
1886 * Start Heritrix.
1887 *
1888 * Used by JMX and webapp initialization for starting Heritrix.
1889 * Not by the cmdline launched Heritrix. Idempotent.
1890 * If start is called by JMX, then new instance of Heritrix is automatically
1891 * registered w/ JMX Agent. If started by webapp, need to register the new
1892 * Heritrix instance.
1893 */
1894 public void start() {
1895
1896
1897 if (!Heritrix.isCommandLine() && !isStarted()) {
1898 try {
1899 logger.info(launch());
1900 } catch (Exception e) {
1901 e.printStackTrace();
1902 }
1903 }
1904 }
1905
1906 /***
1907 * Stop Heritrix.
1908 *
1909 * Used by JMX and webapp initialization for stopping Heritrix.
1910 */
1911 public void stop() {
1912 if (this.jobHandler != null) {
1913 this.jobHandler.stop();
1914 }
1915 }
1916
1917 public String interrupt(String threadName) {
1918 String result = "Thread " + threadName + " not found";
1919 ThreadGroup group = Thread.currentThread().getThreadGroup();
1920 if (group == null) {
1921 return result;
1922 }
1923
1924
1925 ThreadGroup parent = null;
1926 while((parent = group.getParent()) != null) {
1927 group = parent;
1928 }
1929
1930
1931 final int max = group.activeCount() * 2;
1932 Thread [] threads = new Thread[max];
1933 int threadCount = group.enumerate(threads, true);
1934 if (threadCount >= max) {
1935 logger.info("Some threads not found...array too small: " +
1936 max);
1937 }
1938 for (int j = 0; j < threadCount; j++) {
1939 if (threads[j].getName().equals(threadName)) {
1940 threads[j].interrupt();
1941 result = "Interrupt sent to " + threadName;
1942 break;
1943 }
1944 }
1945 return result;
1946 }
1947
1948
1949
1950 /***
1951 * Build up the MBean info for Heritrix main.
1952 * @return Return created mbean info instance.
1953 */
1954 protected OpenMBeanInfoSupport buildMBeanInfo() {
1955 OpenMBeanAttributeInfoSupport[] attributes =
1956 new OpenMBeanAttributeInfoSupport[Heritrix.ATTRIBUTE_LIST.size()];
1957 OpenMBeanConstructorInfoSupport[] constructors =
1958 new OpenMBeanConstructorInfoSupport[1];
1959 OpenMBeanOperationInfoSupport[] operations =
1960 new OpenMBeanOperationInfoSupport[Heritrix.OPERATION_LIST.size()];
1961 MBeanNotificationInfo[] notifications =
1962 new MBeanNotificationInfo[0];
1963
1964
1965 attributes[0] =
1966 new OpenMBeanAttributeInfoSupport(Heritrix.STATUS_ATTR,
1967 "Short basic status message", SimpleType.STRING, true,
1968 false, false);
1969
1970 attributes[1] =
1971 new OpenMBeanAttributeInfoSupport(Heritrix.VERSION_ATTR,
1972 "Heritrix version", SimpleType.STRING, true, false, false);
1973
1974 attributes[2] =
1975 new OpenMBeanAttributeInfoSupport(Heritrix.ISRUNNING_ATTR,
1976 "Whether the crawler is running", SimpleType.BOOLEAN, true,
1977 false, false);
1978
1979 attributes[3] =
1980 new OpenMBeanAttributeInfoSupport(Heritrix.ISCRAWLING_ATTR,
1981 "Whether the crawler is crawling", SimpleType.BOOLEAN, true,
1982 false, false);
1983
1984 attributes[4] =
1985 new OpenMBeanAttributeInfoSupport(Heritrix.ALERTCOUNT_ATTR,
1986 "The number of alerts", SimpleType.INTEGER, true, false, false);
1987
1988 attributes[5] =
1989 new OpenMBeanAttributeInfoSupport(Heritrix.NEWALERTCOUNT_ATTR,
1990 "The number of new alerts", SimpleType.INTEGER, true, false,
1991 false);
1992
1993 attributes[6] =
1994 new OpenMBeanAttributeInfoSupport(Heritrix.CURRENTJOB_ATTR,
1995 "The name of the job currently being crawled",
1996 SimpleType.STRING, true, false, false);
1997
1998
1999 constructors[0] = new OpenMBeanConstructorInfoSupport(
2000 "HeritrixOpenMBean", "Constructs Heritrix OpenMBean instance ",
2001 new OpenMBeanParameterInfoSupport[0]);
2002
2003
2004 operations[0] = new OpenMBeanOperationInfoSupport(
2005 Heritrix.START_OPER, "Start Heritrix instance", null,
2006 SimpleType.VOID, MBeanOperationInfo.ACTION);
2007
2008 operations[1] = new OpenMBeanOperationInfoSupport(
2009 Heritrix.STOP_OPER, "Stop Heritrix instance", null,
2010 SimpleType.VOID, MBeanOperationInfo.ACTION);
2011
2012 OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[1];
2013 args[0] = new OpenMBeanParameterInfoSupport("threadName",
2014 "Name of thread to send interrupt", SimpleType.STRING);
2015 operations[2] = new OpenMBeanOperationInfoSupport(
2016 Heritrix.INTERRUPT_OPER, "Send thread an interrupt " +
2017 "(Used debugging)", args, SimpleType.STRING,
2018 MBeanOperationInfo.ACTION_INFO);
2019
2020 operations[3] = new OpenMBeanOperationInfoSupport(
2021 Heritrix.START_CRAWLING_OPER, "Set Heritrix instance " +
2022 "into crawling mode", null, SimpleType.VOID,
2023 MBeanOperationInfo.ACTION);
2024
2025 operations[4] = new OpenMBeanOperationInfoSupport(
2026 Heritrix.STOP_CRAWLING_OPER, "Unset Heritrix instance " +
2027 " crawling mode", null, SimpleType.VOID,
2028 MBeanOperationInfo.ACTION);
2029
2030 args = new OpenMBeanParameterInfoSupport[4];
2031 args[0] = new OpenMBeanParameterInfoSupport("pathOrURL",
2032 "Path/URL to order or jar of order+seed",
2033 SimpleType.STRING);
2034 args[1] = new OpenMBeanParameterInfoSupport("name",
2035 "Basename for new job", SimpleType.STRING);
2036 args[2] = new OpenMBeanParameterInfoSupport("description",
2037 "Description to save with new job", SimpleType.STRING);
2038 args[3] = new OpenMBeanParameterInfoSupport("seeds",
2039 "Initial seed(s)", SimpleType.STRING);
2040 operations[5] = new OpenMBeanOperationInfoSupport(
2041 Heritrix.ADD_CRAWL_JOB_OPER, "Add new crawl job", args,
2042 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2043
2044 args = new OpenMBeanParameterInfoSupport[4];
2045 args[0] = new OpenMBeanParameterInfoSupport("uidOrName",
2046 "Job UID or profile name", SimpleType.STRING);
2047 args[1] = new OpenMBeanParameterInfoSupport("name",
2048 "Basename for new job", SimpleType.STRING);
2049 args[2] = new OpenMBeanParameterInfoSupport("description",
2050 "Description to save with new job", SimpleType.STRING);
2051 args[3] = new OpenMBeanParameterInfoSupport("seeds",
2052 "Initial seed(s)", SimpleType.STRING);
2053 operations[6] = new OpenMBeanOperationInfoSupport(
2054 Heritrix.ADD_CRAWL_JOB_BASEDON_OPER,
2055 "Add a new crawl job based on passed Job UID or profile",
2056 args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2057
2058 args = new OpenMBeanParameterInfoSupport[1];
2059 args[0] = new OpenMBeanParameterInfoSupport("UID",
2060 "Job UID", SimpleType.STRING);
2061 operations[7] = new OpenMBeanOperationInfoSupport(DELETE_CRAWL_JOB_OPER,
2062 "Delete/stop this crawl job", args, SimpleType.VOID,
2063 MBeanOperationInfo.ACTION);
2064
2065 args = new OpenMBeanParameterInfoSupport[1];
2066 args[0] = new OpenMBeanParameterInfoSupport("index",
2067 "Zero-based index into array of alerts", SimpleType.INTEGER);
2068 operations[8] = new OpenMBeanOperationInfoSupport(
2069 Heritrix.ALERT_OPER, "Return alert at passed index", args,
2070 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2071
2072 try {
2073 this.jobCompositeType = new CompositeType("job",
2074 "Job attributes", JOB_KEYS,
2075 new String [] {"Job unique ID", "Job name", "Job status"},
2076 new OpenType [] {SimpleType.STRING, SimpleType.STRING,
2077 SimpleType.STRING});
2078 this.jobsTabularType = new TabularType("jobs", "List of jobs",
2079 this.jobCompositeType, new String [] {"uid"});
2080 } catch (OpenDataException e) {
2081
2082 throw new RuntimeException(e);
2083 }
2084 operations[9] = new OpenMBeanOperationInfoSupport(
2085 Heritrix.PENDING_JOBS_OPER,
2086 "List of pending jobs (or null if none)", null,
2087 this.jobsTabularType, MBeanOperationInfo.INFO);
2088 operations[10] = new OpenMBeanOperationInfoSupport(
2089 Heritrix.COMPLETED_JOBS_OPER,
2090 "List of completed jobs (or null if none)", null,
2091 this.jobsTabularType, MBeanOperationInfo.INFO);
2092
2093 args = new OpenMBeanParameterInfoSupport[2];
2094 args[0] = new OpenMBeanParameterInfoSupport("uid",
2095 "Job unique ID", SimpleType.STRING);
2096 args[1] = new OpenMBeanParameterInfoSupport("name",
2097 "Report name (e.g. crawl-report, etc.)",
2098 SimpleType.STRING);
2099 operations[11] = new OpenMBeanOperationInfoSupport(
2100 Heritrix.CRAWLEND_REPORT_OPER, "Return crawl-end report", args,
2101 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2102
2103 operations[12] = new OpenMBeanOperationInfoSupport(
2104 Heritrix.SHUTDOWN_OPER, "Shutdown container", null,
2105 SimpleType.VOID, MBeanOperationInfo.ACTION);
2106
2107 args = new OpenMBeanParameterInfoSupport[2];
2108 args[0] = new OpenMBeanParameterInfoSupport("level",
2109 "Log level: e.g. SEVERE, WARNING, etc.", SimpleType.STRING);
2110 args[1] = new OpenMBeanParameterInfoSupport("message",
2111 "Log message", SimpleType.STRING);
2112 operations[13] = new OpenMBeanOperationInfoSupport(Heritrix.LOG_OPER,
2113 "Add a log message", args, SimpleType.VOID,
2114 MBeanOperationInfo.ACTION);
2115
2116 operations[14] = new OpenMBeanOperationInfoSupport(
2117 Heritrix.DESTROY_OPER, "Destroy Heritrix instance", null,
2118 SimpleType.VOID, MBeanOperationInfo.ACTION);
2119
2120 operations[15] = new OpenMBeanOperationInfoSupport(
2121 Heritrix.TERMINATE_CRAWL_JOB_OPER,
2122 "Returns false if no current job", null, SimpleType.BOOLEAN,
2123 MBeanOperationInfo.ACTION);
2124
2125 operations[16] = new OpenMBeanOperationInfoSupport(
2126 Heritrix.REBIND_JNDI_OPER,
2127 "Rebinds this Heritrix with JNDI.", null,
2128 SimpleType.VOID, MBeanOperationInfo.ACTION);
2129
2130
2131 return new OpenMBeanInfoSupport(this.getClass().getName(),
2132 "Heritrix Main OpenMBean", attributes, constructors, operations,
2133 notifications);
2134 }
2135
2136 public Object getAttribute(String attribute_name)
2137 throws AttributeNotFoundException {
2138 if (attribute_name == null) {
2139 throw new RuntimeOperationsException(
2140 new IllegalArgumentException("Attribute name cannot be null"),
2141 "Cannot call getAttribute with null attribute name");
2142 }
2143 if (!Heritrix.ATTRIBUTE_LIST.contains(attribute_name)) {
2144 throw new AttributeNotFoundException("Attribute " +
2145 attribute_name + " is unimplemented.");
2146 }
2147
2148
2149
2150
2151 if (attribute_name.equals(STATUS_ATTR)) {
2152 return getStatus();
2153 }
2154 if (attribute_name.equals(VERSION_ATTR)) {
2155 return getVersion();
2156 }
2157
2158 if (attribute_name.equals(ISRUNNING_ATTR)) {
2159 return new Boolean(this.getJobHandler().isRunning());
2160 }
2161 if (attribute_name.equals(ISCRAWLING_ATTR)) {
2162 return new Boolean(this.getJobHandler().isCrawling());
2163 }
2164 if (attribute_name.equals(ALERTCOUNT_ATTR)) {
2165 return new Integer(getAlertsCount());
2166 }
2167 if (attribute_name.equals(NEWALERTCOUNT_ATTR)) {
2168 return new Integer(getNewAlertsCount());
2169 }
2170 if (attribute_name.equals(CURRENTJOB_ATTR)) {
2171 if (this.getJobHandler().isCrawling()) {
2172 return this.getJobHandler().getCurrentJob().getJmxJobName();
2173 }
2174 return null;
2175 }
2176 throw new AttributeNotFoundException("Attribute " +
2177 attribute_name + " not found.");
2178 }
2179
2180 public void setAttribute(Attribute attribute)
2181 throws AttributeNotFoundException {
2182 throw new AttributeNotFoundException("No attribute can be set in " +
2183 "this MBean");
2184 }
2185
2186 public AttributeList getAttributes(String [] attributeNames) {
2187 if (attributeNames == null) {
2188 throw new RuntimeOperationsException(
2189 new IllegalArgumentException("attributeNames[] cannot be " +
2190 "null"), "Cannot call getAttributes with null attribute " +
2191 "names");
2192 }
2193 AttributeList resultList = new AttributeList();
2194 if (attributeNames.length == 0) {
2195 return resultList;
2196 }
2197 for (int i = 0; i < attributeNames.length; i++) {
2198 try {
2199 Object value = getAttribute(attributeNames[i]);
2200 resultList.add(new Attribute(attributeNames[i], value));
2201 } catch (Exception e) {
2202 e.printStackTrace();
2203 }
2204 }
2205 return(resultList);
2206 }
2207
2208 public AttributeList setAttributes(AttributeList attributes) {
2209 return new AttributeList();
2210 }
2211
2212 public Object invoke(final String operationName, final Object[] params,
2213 final String[] signature)
2214 throws ReflectionException {
2215 if (operationName == null) {
2216 throw new RuntimeOperationsException(
2217 new IllegalArgumentException("Operation name cannot be null"),
2218 "Cannot call invoke with null operation name");
2219 }
2220
2221 if (logger.isLoggable(Level.INFO)) {
2222 String paramsString = "";
2223 for (Object o : params) {
2224 paramsString.concat("[" + o.toString() + "]");
2225 }
2226 logger.info("JMX invoke: " + operationName + " [" + paramsString
2227 + "]");
2228 }
2229
2230
2231
2232
2233 if (operationName.equals(START_OPER)) {
2234 JmxUtils.checkParamsCount(START_OPER, params, 0);
2235 start();
2236 return null;
2237 }
2238 if (operationName.equals(STOP_OPER)) {
2239 JmxUtils.checkParamsCount(STOP_OPER, params, 0);
2240 stop();
2241 return null;
2242 }
2243 if (operationName.equals(DESTROY_OPER)) {
2244 JmxUtils.checkParamsCount(DESTROY_OPER, params, 0);
2245 destroy();
2246 return null;
2247 }
2248 if (operationName.equals(TERMINATE_CRAWL_JOB_OPER)) {
2249 JmxUtils.checkParamsCount(TERMINATE_CRAWL_JOB_OPER, params, 0);
2250 return new Boolean(this.jobHandler.terminateCurrentJob());
2251 }
2252 if (operationName.equals(REBIND_JNDI_OPER)) {
2253 JmxUtils.checkParamsCount(REBIND_JNDI_OPER, params, 0);
2254 try {
2255 registerContainerJndi();
2256 } catch (MalformedObjectNameException e) {
2257 throw new RuntimeOperationsException(new RuntimeException(e));
2258 } catch (UnknownHostException e) {
2259 throw new RuntimeOperationsException(new RuntimeException(e));
2260 } catch (NamingException e) {
2261 throw new RuntimeOperationsException(new RuntimeException(e));
2262 }
2263 return null;
2264 }
2265 if (operationName.equals(SHUTDOWN_OPER)) {
2266 JmxUtils.checkParamsCount(SHUTDOWN_OPER, params, 0);
2267 Heritrix.shutdown();
2268 return null;
2269 }
2270 if (operationName.equals(LOG_OPER)) {
2271 JmxUtils.checkParamsCount(LOG_OPER, params, 2);
2272 logger.log(Level.parse((String)params[0]), (String)params[1]);
2273 return null;
2274 }
2275 if (operationName.equals(INTERRUPT_OPER)) {
2276 JmxUtils.checkParamsCount(INTERRUPT_OPER, params, 1);
2277 return interrupt((String)params[0]);
2278 }
2279 if (operationName.equals(START_CRAWLING_OPER)) {
2280 JmxUtils.checkParamsCount(START_CRAWLING_OPER, params, 0);
2281 startCrawling();
2282 return null;
2283 }
2284 if (operationName.equals(STOP_CRAWLING_OPER)) {
2285 JmxUtils.checkParamsCount(STOP_CRAWLING_OPER, params, 0);
2286 stopCrawling();
2287 return null;
2288 }
2289 if (operationName.equals(ADD_CRAWL_JOB_OPER)) {
2290 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_OPER, params, 4);
2291 try {
2292 return addCrawlJob((String)params[0], (String)params[1],
2293 checkForEmptyPlaceHolder((String)params[2]),
2294 checkForEmptyPlaceHolder((String)params[3]));
2295 } catch (IOException e) {
2296 throw new RuntimeOperationsException(new RuntimeException(e));
2297 } catch (FatalConfigurationException e) {
2298 throw new RuntimeOperationsException(new RuntimeException(e));
2299 }
2300 }
2301 if (operationName.equals(DELETE_CRAWL_JOB_OPER)) {
2302 JmxUtils.checkParamsCount(DELETE_CRAWL_JOB_OPER, params, 1);
2303 this.jobHandler.deleteJob((String)params[0]);
2304 return null;
2305 }
2306
2307 if (operationName.equals(ADD_CRAWL_JOB_BASEDON_OPER)) {
2308 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_BASEDON_OPER, params, 4);
2309 return addCrawlJobBasedOn((String)params[0], (String)params[1],
2310 checkForEmptyPlaceHolder((String)params[2]),
2311 checkForEmptyPlaceHolder((String)params[3]));
2312 }
2313 if (operationName.equals(ALERT_OPER)) {
2314 JmxUtils.checkParamsCount(ALERT_OPER, params, 1);
2315 SinkHandlerLogRecord slr = null;
2316 if (this.alertManager.getCount() > 0) {
2317
2318
2319
2320 slr = (SinkHandlerLogRecord)this.alertManager.getAll().
2321 get(((Integer)params[0]).intValue());
2322 }
2323 return (slr != null)? slr.toString(): null;
2324 }
2325
2326 if (operationName.equals(PENDING_JOBS_OPER)) {
2327 JmxUtils.checkParamsCount(PENDING_JOBS_OPER, params, 0);
2328 try {
2329 return makeJobsTabularData(getJobHandler().getPendingJobs());
2330 } catch (OpenDataException e) {
2331 throw new RuntimeOperationsException(new RuntimeException(e));
2332 }
2333 }
2334
2335 if (operationName.equals(COMPLETED_JOBS_OPER)) {
2336 JmxUtils.checkParamsCount(COMPLETED_JOBS_OPER, params, 0);
2337 try {
2338 return makeJobsTabularData(getJobHandler().getCompletedJobs());
2339 } catch (OpenDataException e) {
2340 throw new RuntimeOperationsException(new RuntimeException(e));
2341 }
2342 }
2343
2344 if (operationName.equals(CRAWLEND_REPORT_OPER)) {
2345 JmxUtils.checkParamsCount(CRAWLEND_REPORT_OPER, params, 2);
2346 try {
2347 return getCrawlendReport((String)params[0], (String) params[1]);
2348 } catch (IOException e) {
2349 throw new RuntimeOperationsException(new RuntimeException(e));
2350 }
2351 }
2352
2353 throw new ReflectionException(
2354 new NoSuchMethodException(operationName),
2355 "Cannot find the operation " + operationName);
2356 }
2357
2358 /***
2359 * Return named crawl end report for job with passed uid.
2360 * Crawler makes reports when its finished its crawl. Use this method
2361 * to get a String version of one of these files.
2362 * @param jobUid The unique ID for the job whose reports you want to see
2363 * (Must be a completed job).
2364 * @param reportName Name of report minus '.txt' (e.g. crawl-report).
2365 * @return String version of the on-disk report.
2366 * @throws IOException
2367 */
2368 protected String getCrawlendReport(String jobUid, String reportName)
2369 throws IOException {
2370 CrawlJob job = getJobHandler().getJob(jobUid);
2371 if (job == null) {
2372 throw new IOException("No such job: " + jobUid);
2373 }
2374 File report = new File(job.getDirectory(), reportName + ".txt");
2375 if (!report.exists()) {
2376 throw new FileNotFoundException(report.getAbsolutePath());
2377 }
2378 return FileUtils.readFileAsString(report);
2379 }
2380
2381 protected TabularData makeJobsTabularData(List jobs)
2382 throws OpenDataException {
2383 if (jobs == null || jobs.size() == 0) {
2384 return null;
2385 }
2386 TabularData td = new TabularDataSupport(this.jobsTabularType);
2387 for (Iterator i = jobs.iterator(); i.hasNext();) {
2388 CrawlJob job = (CrawlJob)i.next();
2389 CompositeData cd = new CompositeDataSupport(this.jobCompositeType,
2390 JOB_KEYS,
2391 new String [] {job.getUID(), job.getJobName(), job.getStatus()});
2392 td.put(cd);
2393 }
2394 return td;
2395 }
2396
2397 /***
2398 * If passed str has placeholder for the empty string, return the empty
2399 * string else return orginal.
2400 * Dumb jmx clients can't pass empty string so they'll pass a representation
2401 * of empty string such as ' ' or '-'. Convert such strings to empty
2402 * string.
2403 * @param str String to check.
2404 * @return Original <code>str</code> or empty string if <code>str</code>
2405 * contains a placeholder for the empty-string (e.g. '-', or ' ').
2406 */
2407 protected String checkForEmptyPlaceHolder(String str) {
2408 return TextUtils.matches("-| +", str)? "": str;
2409 }
2410
2411 public MBeanInfo getMBeanInfo() {
2412 return this.openMBeanInfo;
2413 }
2414
2415 /***
2416 * @return Name this instance registered in JMX (Only available after JMX
2417 * registration).
2418 */
2419 public ObjectName getMBeanName() {
2420 return this.mbeanName;
2421 }
2422
2423 public ObjectName preRegister(MBeanServer server, ObjectName name)
2424 throws Exception {
2425 this.mbeanServer = server;
2426 @SuppressWarnings("unchecked")
2427 Hashtable<String,String> ht = name.getKeyPropertyList();
2428 if (!ht.containsKey(JmxUtils.NAME)) {
2429 throw new IllegalArgumentException("Name property required" +
2430 name.getCanonicalName());
2431 }
2432 if (!ht.containsKey(JmxUtils.TYPE)) {
2433 ht.put(JmxUtils.TYPE, JmxUtils.SERVICE);
2434 name = new ObjectName(name.getDomain(), ht);
2435 }
2436 this.mbeanName = addGuiPort(addVitals(name));
2437 Heritrix.instances.put(this.mbeanName.
2438 getCanonicalKeyPropertyListString(), this);
2439 return this.mbeanName;
2440 }
2441
2442 /***
2443 * Add vital stats to passed in ObjectName.
2444 * @param name ObjectName to add to.
2445 * @return name with host, guiport, and jmxport added.
2446 * @throws UnknownHostException
2447 * @throws MalformedObjectNameException
2448 * @throws NullPointerException
2449 */
2450 protected static ObjectName addVitals(ObjectName name)
2451 throws UnknownHostException, MalformedObjectNameException,
2452 NullPointerException {
2453 @SuppressWarnings("unchecked")
2454 Hashtable<String,String> ht = name.getKeyPropertyList();
2455 if (!ht.containsKey(JmxUtils.HOST)) {
2456 ht.put(JmxUtils.HOST, InetAddress.getLocalHost().getHostName());
2457 name = new ObjectName(name.getDomain(), ht);
2458 }
2459 if (!ht.containsKey(JmxUtils.JMX_PORT)) {
2460
2461
2462
2463
2464 String p = System.getProperty("com.sun.management.jmxremote.port");
2465 if (p != null && p.length() > 0) {
2466 ht.put(JmxUtils.JMX_PORT, p);
2467 name = new ObjectName(name.getDomain(), ht);
2468 }
2469 }
2470 return name;
2471 }
2472
2473 protected static ObjectName addGuiPort(ObjectName name)
2474 throws MalformedObjectNameException, NullPointerException {
2475 @SuppressWarnings("unchecked")
2476 Hashtable<String,String> ht = name.getKeyPropertyList();
2477 if (!ht.containsKey(JmxUtils.GUI_PORT)) {
2478
2479 if (Heritrix.gui) {
2480 ht.put(JmxUtils.GUI_PORT, Integer.toString(Heritrix.guiPort));
2481 name = new ObjectName(name.getDomain(), ht);
2482 }
2483 }
2484 return name;
2485 }
2486
2487 public void postRegister(Boolean registrationDone) {
2488 if (logger.isLoggable(Level.INFO)) {
2489 logger.info(
2490 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2491 this.mbeanServer, registrationDone.booleanValue()));
2492 }
2493 try {
2494 registerJndi(this.mbeanName);
2495 } catch (Exception e) {
2496 logger.log(Level.SEVERE, "Failed jndi registration", e);
2497 }
2498 }
2499
2500 public void preDeregister() throws Exception {
2501 deregisterJndi(this.mbeanName);
2502 }
2503
2504 public void postDeregister() {
2505 Heritrix.instances.
2506 remove(this.mbeanName.getCanonicalKeyPropertyListString());
2507 if (logger.isLoggable(Level.INFO)) {
2508 logger.info(JmxUtils.getLogUnregistrationMsg(
2509 this.mbeanName.getCanonicalName(), this.mbeanServer));
2510 }
2511 }
2512
2513 protected static void registerContainerJndi()
2514 throws MalformedObjectNameException, NullPointerException,
2515 UnknownHostException, NamingException {
2516 registerJndi(getJndiContainerName());
2517 }
2518
2519 protected static void registerJndi(final ObjectName name)
2520 throws NullPointerException, NamingException {
2521 Context c = getJndiContext();
2522 if (c == null) {
2523 return;
2524 }
2525 CompoundName key = JndiUtils.bindObjectName(c, name);
2526 if (logger.isLoggable(Level.FINE)) {
2527 logger.fine("Bound '" + key + "' to '" + JndiUtils.
2528 getCompoundName(c.getNameInNamespace()).toString()
2529 + "' jndi context");
2530 }
2531 }
2532
2533 protected static void deregisterJndi(final ObjectName name)
2534 throws NullPointerException, NamingException {
2535 Context c = getJndiContext();
2536 if (c == null) {
2537 return;
2538 }
2539 CompoundName key = JndiUtils.unbindObjectName(c, name);
2540 if (logger.isLoggable(Level.FINE)) {
2541 logger.fine("Unbound '" + key + "' from '" +
2542 JndiUtils.getCompoundName(c.getNameInNamespace()).toString() +
2543 "' jndi context");
2544 }
2545 }
2546
2547 /***
2548 * @return Jndi context for the crawler or null if none found.
2549 * @throws NamingException
2550 */
2551 protected static Context getJndiContext() throws NamingException {
2552 Context c = null;
2553 try {
2554 c = JndiUtils.getSubContext(CRAWLER_PACKAGE);
2555 } catch (NoInitialContextException e) {
2556 logger.fine("No JNDI Context: " + e.toString());
2557 }
2558 return c;
2559 }
2560
2561 /***
2562 * @return Jndi container name -- the name to use for the 'container' that
2563 * can host zero or more heritrix instances (Return a JMX ObjectName. We
2564 * use ObjectName because then we're sync'd with JMX naming and ObjectName
2565 * has nice parsing).
2566 * @throws NullPointerException
2567 * @throws MalformedObjectNameException
2568 * @throws UnknownHostException
2569 */
2570 protected static ObjectName getJndiContainerName()
2571 throws MalformedObjectNameException, NullPointerException,
2572 UnknownHostException {
2573 ObjectName objName = new ObjectName(CRAWLER_PACKAGE, "type",
2574 "container");
2575 return addVitals(objName);
2576 }
2577
2578 /***
2579 * @return Return all registered instances of Heritrix (Rare are there
2580 * more than one).
2581 */
2582 public static Map getInstances() {
2583 return Heritrix.instances;
2584 }
2585
2586 /***
2587 * @return True if only one instance of Heritrix.
2588 */
2589 public static boolean isSingleInstance() {
2590 return Heritrix.instances != null && Heritrix.instances.size() == 1;
2591 }
2592
2593 /***
2594 * @return Returns single instance or null if no instance or multiple.
2595 */
2596 public static Heritrix getSingleInstance() {
2597 return !isSingleInstance()?
2598 null:
2599 (Heritrix)Heritrix.instances.
2600 get(Heritrix.instances.keySet().iterator().next());
2601 }
2602 }