1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.framework;
26
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.StringWriter;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import javax.xml.transform.SourceLocator;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.Type;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.ObjectPlusFilesInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
72
73 /***
74 * Abstract implementation of a file pool processor.
75 * Subclass to implement for a particular {@link WriterPoolMember} instance.
76 * @author Parker Thompson
77 * @author stack
78 */
public abstract class WriterPoolProcessor extends Processor
implements CoreAttributeConstants, CrawlStatusListener, FetchStatusCodes {
    // Logger named after the concrete subclass so log output identifies
    // which writer processor (ARC, WARC, ...) emitted a message.
    private final Logger logger = Logger.getLogger(this.getClass().getName());

    /***
     * Key to use asking settings for file compression value.
     */
    public static final String ATTR_COMPRESS = "compress";

    /***
     * Default as to whether we do compression of files.
     */
    public static final boolean DEFAULT_COMPRESS = true;

    /***
     * Key to use asking settings for file prefix value.
     */
    public static final String ATTR_PREFIX = "prefix";

    /***
     * Key to use asking settings for arc path value.
     */
    public static final String ATTR_PATH ="path";

    /***
     * Key to use asking settings for file suffix value.
     */
    public static final String ATTR_SUFFIX = "suffix";

    /***
     * Key to use asking settings for file max size value.
     */
    public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";

    /***
     * Key to get maximum pool size.
     *
     * This key is for maximum files active in the pool.
     */
    public static final String ATTR_POOL_MAX_ACTIVE = "pool-max-active";

    /***
     * Key to get maximum wait on pool object before we give up and
     * throw IOException.
     */
    public static final String ATTR_POOL_MAX_WAIT = "pool-max-wait";

    /***
     * Key for the maximum bytes to write attribute.
     */
    public static final String ATTR_MAX_BYTES_WRITTEN =
        "total-bytes-to-write";

    /***
     * Key for whether to skip writing records of content-digest repeats.
     */
    public static final String ATTR_SKIP_IDENTICAL_DIGESTS =
        "skip-identical-digests";

    /***
     * CrawlURI annotation indicating no record was written.
     */
    protected static final String ANNOTATION_UNWRITTEN = "unwritten";

    /***
     * Default maximum file size (bytes).
     * TODO: Check that subclasses can set a different MAX_FILE_SIZE and
     * it will be used in the constructor as default.
     */
    private static final int DEFAULT_MAX_FILE_SIZE = 100000000;

    /***
     * Default path list.
     *
     * TODO: Confirm this one gets picked up.
     */
    private static final String [] DEFAULT_PATH = {"crawl-store"};

    /***
     * Reference to pool.  Transient: rebuilt via setupPool() after
     * deserialization (see readObject) rather than serialized.
     */
    transient private WriterPool pool = null;

    /***
     * Total number of bytes written to disc.
     */
    private long totalBytesWritten = 0;

    /***
     * Calculate metadata once only.  Lazily populated by cacheMetadata().
     */
    transient private List<String> cachedMetadata = null;
172
    /***
     * Convenience constructor supplying a generic description.
     * @param name Name of this processor.
     */
    public WriterPoolProcessor(String name) {
        this(name, "Pool of files processor");
    }
179
180 /***
181 * @param name Name of this processor.
182 * @param description Description for this processor.
183 */
184 public WriterPoolProcessor(final String name,
185 final String description) {
186 super(name, description);
187 Type e = addElementToDefinition(
188 new SimpleType(ATTR_COMPRESS, "Compress files when " +
189 "writing to disk.", new Boolean(DEFAULT_COMPRESS)));
190 e.setOverrideable(false);
191 e = addElementToDefinition(
192 new SimpleType(ATTR_PREFIX,
193 "File prefix. " +
194 "The text supplied here will be used as a prefix naming " +
195 "writer files. For example if the prefix is 'IAH', " +
196 "then file names will look like " +
197 "IAH-20040808101010-0001-HOSTNAME.arc.gz " +
198 "...if writing ARCs (The prefix will be " +
199 "separated from the date by a hyphen).",
200 WriterPoolMember.DEFAULT_PREFIX));
201 e = addElementToDefinition(
202 new SimpleType(ATTR_SUFFIX, "Suffix to tag onto " +
203 "files. '${HOSTNAME}' in the suffix will be " +
204 "replaced with the local hostname. If empty, " +
205 "no suffix will be added.",
206 WriterPoolMember.DEFAULT_SUFFIX));
207 e.setOverrideable(false);
208 e = addElementToDefinition(
209 new SimpleType(ATTR_MAX_SIZE_BYTES, "Max size of each file",
210 new Long(DEFAULT_MAX_FILE_SIZE)));
211 e.setOverrideable(false);
212 e = addElementToDefinition(
213 new StringList(ATTR_PATH, "Where to files. " +
214 "Supply absolute or relative path. If relative, files " +
215 "will be written relative to " +
216 "the " + CrawlOrder.ATTR_DISK_PATH + "setting." +
217 " If more than one path specified, we'll round-robin" +
218 " dropping files to each. This setting is safe" +
219 " to change midcrawl (You can remove and add new dirs" +
220 " as the crawler progresses).", getDefaultPath()));
221 e.setOverrideable(false);
222 e = addElementToDefinition(new SimpleType(ATTR_POOL_MAX_ACTIVE,
223 "Maximum active files in pool. " +
224 "This setting cannot be varied over the life of a crawl.",
225 new Integer(WriterPool.DEFAULT_MAX_ACTIVE)));
226 e.setOverrideable(false);
227 e = addElementToDefinition(new SimpleType(ATTR_POOL_MAX_WAIT,
228 "Maximum time to wait on pool element" +
229 " (milliseconds). This setting cannot be varied over the life" +
230 " of a crawl.",
231 new Integer(WriterPool.DEFAULT_MAXIMUM_WAIT)));
232 e.setOverrideable(false);
233 e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_WRITTEN,
234 "Total file bytes to write to disk." +
235 " Once the size of all files on disk has exceeded this " +
236 "limit, this processor will stop the crawler. " +
237 "A value of zero means no upper limit.", new Long(0)));
238 e.setOverrideable(false);
239 e.setExpertSetting(true);
240 e = addElementToDefinition(new SimpleType(ATTR_SKIP_IDENTICAL_DIGESTS,
241 "Whether to skip the writing of a record when URI " +
242 "history information is available and indicates the " +
243 "prior fetch had an identical content digest. " +
244 "Default is false.", new Boolean(false)));
245 e.setOverrideable(true);
246 e.setExpertSetting(true);
247 }
248
    /***
     * @return Default output path list; subclasses may override to supply
     * a writer-specific default (e.g. "arcs" or "warcs").
     */
    protected String [] getDefaultPath() {
        return DEFAULT_PATH;
    }
252
    /***
     * Register for crawl lifecycle events, create the writer pool, and,
     * when recovering from a checkpoint, restore the writer serial number.
     */
    public synchronized void initialTasks() {
        // Listen for crawl status events so the pool can be closed or
        // rotated on checkpoint and crawl end.
        getSettingsHandler().getOrder().getController().
            addCrawlStatusListener(this);
        setupPool(new AtomicInteger());
        // Restore serial number saved by crawlCheckpoint(), if any.
        if (getSettingsHandler().getOrder().getController().
                isCheckpointRecover()) {
            checkpointRecover();
        }
    }
264
265 protected AtomicInteger getSerialNo() {
266 return ((WriterPool)getPool()).getSerialNo();
267 }
268
    /***
     * Set up pool of files.
     * @param serialNo Shared counter used to serially name member files;
     * pass a pre-seeded instance when recovering a checkpoint.
     */
    protected abstract void setupPool(final AtomicInteger serialNo);

    /***
     * Writes a CrawlURI and its associated data to store file.
     *
     * Currently this method understands the following uri types: dns, http,
     * and https.
     *
     * @param curi CrawlURI to process.
     */
    protected abstract void innerProcess(CrawlURI curi);
283
284 protected void checkBytesWritten() {
285 long max = getMaxToWrite();
286 if (max <= 0) {
287 return;
288 }
289 if (max <= this.totalBytesWritten) {
290 getController().requestCrawlStop("Finished - Maximum bytes (" +
291 Long.toString(max) + ") written");
292 }
293 }
294
295 /***
296 * Whether the given CrawlURI should be written to archive files.
297 * Annotates CrawlURI with a reason for any negative answer.
298 *
299 * @param curi CrawlURI
300 * @return true if URI should be written; false otherwise
301 */
302 protected boolean shouldWrite(CrawlURI curi) {
303
304 if(((Boolean)getUncheckedAttribute(curi, ATTR_SKIP_IDENTICAL_DIGESTS))
305 && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
306 curi.addAnnotation(ANNOTATION_UNWRITTEN + ":identicalDigest");
307 return false;
308 }
309 String scheme = curi.getUURI().getScheme().toLowerCase();
310
311 boolean retVal;
312 if (scheme.equals("dns")) {
313 retVal = curi.getFetchStatus() == S_DNS_SUCCESS;
314 } else if (scheme.equals("http") || scheme.equals("https")) {
315 retVal = curi.getFetchStatus() > 0 && curi.isHttpTransaction();
316 } else if (scheme.equals("ftp")) {
317 retVal = curi.getFetchStatus() == 200;
318 } else {
319
320 curi.addAnnotation(ANNOTATION_UNWRITTEN + ":scheme");
321 return false;
322 }
323 if (retVal == false) {
324
325 curi.addAnnotation(ANNOTATION_UNWRITTEN + ":status");
326 return false;
327 }
328 return true;
329 }
330
331 /***
332 * Return IP address of given URI suitable for recording (as in a
333 * classic ARC 5-field header line).
334 *
335 * @param curi CrawlURI
336 * @return String of IP address
337 */
338 protected String getHostAddress(CrawlURI curi) {
339
340 if(curi.getUURI().getScheme().toLowerCase().equals("dns")) {
341 return curi.getString(A_DNS_SERVER_IP_LABEL);
342 }
343
344 CrawlHost h = getController().getServerCache().getHostFor(curi);
345 if (h == null) {
346 throw new NullPointerException("Crawlhost is null for " +
347 curi + " " + curi.getVia());
348 }
349 InetAddress a = h.getIP();
350 if (a == null) {
351 throw new NullPointerException("Address is null for " +
352 curi + " " + curi.getVia() + ". Address " +
353 ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP)?
354 "was never looked up.":
355 (System.currentTimeMillis() - h.getIpFetched()) +
356 " ms ago."));
357 }
358 return h.getIP().getHostAddress();
359 }
360
361 /***
362 * Version of getAttributes that catches and logs exceptions
363 * and returns null if failure to fetch the attribute.
364 * @param name Attribute name.
365 * @return Attribute or null.
366 */
367 public Object getAttributeUnchecked(String name) {
368 Object result = null;
369 try {
370 result = super.getAttribute(name);
371 } catch (AttributeNotFoundException e) {
372 logger.warning(e.getLocalizedMessage());
373 } catch (MBeanException e) {
374 logger.warning(e.getLocalizedMessage());
375 } catch (ReflectionException e) {
376 logger.warning(e.getLocalizedMessage());
377 }
378 return result;
379 }
380
381 /***
382 * Max size we want files to be (bytes).
383 *
384 * Default is ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE. Note that ARC
385 * files will usually be bigger than maxSize; they'll be maxSize + length
386 * to next boundary.
387 * @return ARC maximum size.
388 */
389 public long getMaxSize() {
390 Object obj = getAttributeUnchecked(ATTR_MAX_SIZE_BYTES);
391 return (obj == null)? DEFAULT_MAX_FILE_SIZE: ((Long)obj).longValue();
392 }
393
394 public String getPrefix() {
395 Object obj = getAttributeUnchecked(ATTR_PREFIX);
396 return (obj == null)? WriterPoolMember.DEFAULT_PREFIX: (String)obj;
397 }
398
399 public List<File> getOutputDirs() {
400 Object obj = getAttributeUnchecked(ATTR_PATH);
401 List list = (obj == null)? Arrays.asList(DEFAULT_PATH): (StringList)obj;
402 ArrayList<File> results = new ArrayList<File>();
403 for (Iterator i = list.iterator(); i.hasNext();) {
404 String path = (String)i.next();
405 File f = new File(path);
406 if (!f.isAbsolute()) {
407 f = new File(getController().getDisk(), path);
408 }
409 if (!f.exists()) {
410 try {
411 f.mkdirs();
412 } catch (Exception e) {
413 e.printStackTrace();
414 continue;
415 }
416 }
417 results.add(f);
418 }
419 return results;
420 }
421
422 public boolean isCompressed() {
423 Object obj = getAttributeUnchecked(ATTR_COMPRESS);
424 return (obj == null)? DEFAULT_COMPRESS:
425 ((Boolean)obj).booleanValue();
426 }
427
428 /***
429 * @return Returns the poolMaximumActive.
430 */
431 public int getPoolMaximumActive() {
432 Object obj = getAttributeUnchecked(ATTR_POOL_MAX_ACTIVE);
433 return (obj == null)? WriterPool.DEFAULT_MAX_ACTIVE:
434 ((Integer)obj).intValue();
435 }
436
437 /***
438 * @return Returns the poolMaximumWait.
439 */
440 public int getPoolMaximumWait() {
441 Object obj = getAttributeUnchecked(ATTR_POOL_MAX_WAIT);
442 return (obj == null)? WriterPool.DEFAULT_MAXIMUM_WAIT:
443 ((Integer)obj).intValue();
444 }
445
446 public String getSuffix() {
447 Object obj = getAttributeUnchecked(ATTR_SUFFIX);
448 String sfx = (obj == null)?
449 WriterPoolMember.DEFAULT_SUFFIX: (String)obj;
450 sfx = sfx.trim();
451 if (sfx.contains(WriterPoolMember.HOSTNAME_VARIABLE)) {
452 String str = "localhost.localdomain";
453 try {
454 str = InetAddress.getLocalHost().getHostName();
455 } catch (UnknownHostException ue) {
456 logger.severe("Failed getHostAddress for this host: " + ue);
457 }
458 sfx = sfx.replace(WriterPoolMember.HOSTNAME_VARIABLE, str);
459 }
460 return sfx;
461 }
462
463 public long getMaxToWrite() {
464 Object obj = getAttributeUnchecked(ATTR_MAX_BYTES_WRITTEN);
465 return (obj == null)? 0: ((Long)obj).longValue();
466 }
467
468 public void crawlEnding(String sExitMessage) {
469 this.pool.close();
470 }
471
    /***
     * No-op: the pool is already closed in crawlEnding().
     * @param sExitMessage Exit message (unused).
     */
    public void crawlEnded(String sExitMessage) {
    }

    /***
     * No-op: the pool is created in initialTasks(), not here.
     * @param message Start message (unused).
     */
    public void crawlStarted(String message) {
    }
482
    /***
     * @return Name of the per-processor checkpoint state file that holds
     * the writer serial number (class name + ".state").
     */
    protected String getCheckpointStateFile() {
        return this.getClass().getName() + ".state";
    }
486
    /***
     * Checkpoint: persist the writer serial number, close all pool files,
     * and reopen a fresh pool seeded with the saved serial.
     * @param checkpointDir Directory to write checkpoint state into.
     * @throws IOException on failure saving serial number.
     */
    public void crawlCheckpoint(File checkpointDir) throws IOException {
        int serial = getSerialNo().get();
        if (this.pool.getNumActive() > 0) {
            // Writers are still active using the current serial;
            // presumably bumping it keeps post-recovery file names
            // unique -- TODO confirm intent.
            serial = getSerialNo().incrementAndGet();
        }
        saveCheckpointSerialNumber(checkpointDir, serial);
        try {
            this.pool.close();
        } finally {
            // Always reopen the pool, even if close() threw, so the
            // crawl can continue after the checkpoint.
            setupPool(new AtomicInteger(serial));
        }
    }
506
    /***
     * No-op: nothing to do when the crawl begins pausing.
     * @param statusMessage Status message (unused).
     */
    public void crawlPausing(String statusMessage) {
    }

    /***
     * No-op: pool files stay open across a pause.
     * @param statusMessage Status message (unused).
     */
    public void crawlPaused(String statusMessage) {
    }

    /***
     * No-op: nothing to restore on resume.
     * @param statusMessage Status message (unused).
     */
    public void crawlResuming(String statusMessage) {
    }
518
    /***
     * Custom deserialization: the pool is transient, so schedule its
     * re-creation once the rest of the object graph has been restored.
     * @param stream Input stream; expected to be the Heritrix
     * checkpoint-recovery ObjectPlusFilesInputStream.
     * @throws IOException on read failure.
     * @throws ClassNotFoundException on missing class.
     */
    private void readObject(ObjectInputStream stream)
    throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        // Cast is unchecked by design: deserialization of this class only
        // happens during checkpoint recovery, which uses this stream type.
        ObjectPlusFilesInputStream coistream =
            (ObjectPlusFilesInputStream)stream;
        coistream.registerFinishTask( new Runnable() {
            public void run() {
                // Rebuild the transient pool after deserialization completes.
                setupPool(new AtomicInteger());
            }
        });
    }
530
    /***
     * @return The current writer pool (null before setupPool() runs).
     */
    protected WriterPool getPool() {
        return pool;
    }

    /***
     * @param pool Writer pool instance for this processor; called by
     * subclasses from their setupPool() implementations.
     */
    protected void setPool(WriterPool pool) {
        this.pool = pool;
    }

    /***
     * @return Total bytes written so far by this processor.
     */
    protected long getTotalBytesWritten() {
        return totalBytesWritten;
    }

    /***
     * @param totalBytesWritten Running byte total; updated by subclasses
     * after each record write.
     */
    protected void setTotalBytesWritten(long totalBytesWritten) {
        this.totalBytesWritten = totalBytesWritten;
    }
546
547 /***
548 * Called out of {@link #initialTasks()} when recovering a checkpoint.
549 * Restore state.
550 */
551 protected void checkpointRecover() {
552 int serialNo = loadCheckpointSerialNumber();
553 if (serialNo != -1) {
554 getSerialNo().set(serialNo);
555 }
556 }
557
558 /***
559 * @return Serial number from checkpoint state file or if unreadable, -1
560 * (Client should check for -1).
561 */
562 protected int loadCheckpointSerialNumber() {
563 int result = -1;
564
565
566
567 File stateFile = new File(getSettingsHandler().getOrder()
568 .getController().getCheckpointRecover().getDirectory(),
569 getCheckpointStateFile());
570 if (!stateFile.exists()) {
571 logger.info(stateFile.getAbsolutePath()
572 + " doesn't exist so cannot restore Writer serial number.");
573 } else {
574 DataInputStream dis = null;
575 try {
576 dis = new DataInputStream(new FileInputStream(stateFile));
577 result = dis.readShort();
578 } catch (FileNotFoundException e) {
579 e.printStackTrace();
580 } catch (IOException e) {
581 e.printStackTrace();
582 } finally {
583 try {
584 if (dis != null) {
585 dis.close();
586 }
587 } catch (IOException e) {
588 e.printStackTrace();
589 }
590 }
591 }
592 return result;
593 }
594
    /***
     * Persist the writer serial number into the checkpoint directory so a
     * later recovery can continue numbering where this crawl left off.
     * @param checkpointDir Directory to write the state file into.
     * @param serialNo Serial number to save.
     * @throws IOException on failed write.
     */
    protected void saveCheckpointSerialNumber(final File checkpointDir,
            final int serialNo)
    throws IOException {
        File f = new File(checkpointDir, getCheckpointStateFile());
        DataOutputStream dos = new DataOutputStream(new FileOutputStream(f));
        try {
            // NOTE(review): writeShort silently truncates serials above
            // Short.MAX_VALUE; width must stay in sync with
            // loadCheckpointSerialNumber()'s readShort -- confirm before
            // widening either side.
            dos.writeShort(serialNo);
        } finally {
            dos.close();
        }
    }
607
608 /***
609 * Return list of metadatas to add to first arc file metadata record.
610 *
611 * Default is to stylesheet the order file. To specify stylesheet,
612 * override {@link #getFirstrecordStylesheet()}.
613 *
614 * Get xml files from settingshandler. Currently order file is the
615 * only xml file. We're NOT adding seeds to meta data.
616 *
617 * @return List of strings and/or files to add to arc file as metadata or
618 * null.
619 */
620 public synchronized List<String> getMetadata() {
621 if (this.cachedMetadata != null) {
622 return this.cachedMetadata;
623 }
624 return cacheMetadata();
625 }
626
627 protected synchronized List<String> cacheMetadata() {
628
629
630 if (getFirstrecordStylesheet() == null ||
631 getFirstrecordStylesheet().length() == 0) {
632 this.cachedMetadata = new ArrayList<String>(1);
633 this.cachedMetadata.add("");
634 return this.cachedMetadata;
635 }
636
637 List<String> result = null;
638 if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) {
639 logger.warning("Expected xml settings handler (No warcinfo).");
640
641 return result;
642 }
643
644 XMLSettingsHandler xsh = (XMLSettingsHandler)getSettingsHandler();
645 File orderFile = xsh.getOrderFile();
646 if (!orderFile.exists() || !orderFile.canRead()) {
647 logger.severe("File " + orderFile.getAbsolutePath() +
648 " is does not exist or is not readable.");
649 } else {
650 result = new ArrayList<String>(1);
651 result.add(getFirstrecordBody(orderFile));
652 }
653 this.cachedMetadata = result;
654 return this.cachedMetadata;
655 }
656
    /***
     * @return Full path to stylesheet (Its read off the CLASSPATH
     * as resource), or null (the default) meaning no stylesheet and an
     * empty metadata body.
     */
    protected String getFirstrecordStylesheet() {
        return null;
    }
664
665 /***
666 * Write the arc metadata body content.
667 *
668 * Its based on the order xml file but into this base we'll add other info
669 * such as machine ip.
670 *
671 * @param orderFile Order file.
672
673 *
674 * @return String that holds the arc metaheader body.
675 */
676 protected String getFirstrecordBody(File orderFile) {
677 String result = null;
678 TransformerFactory factory = TransformerFactory.newInstance();
679 Templates templates = null;
680 Transformer xformer = null;
681 try {
682 templates = factory.newTemplates(new StreamSource(
683 this.getClass().getResourceAsStream(getFirstrecordStylesheet())));
684 xformer = templates.newTransformer();
685
686 xformer.setParameter("software", "Heritrix " +
687 Heritrix.getVersion() + " http://crawler.archive.org");
688 xformer.setParameter("ip",
689 InetAddress.getLocalHost().getHostAddress());
690 xformer.setParameter("hostname",
691 InetAddress.getLocalHost().getHostName());
692 StreamSource source = new StreamSource(
693 new FileInputStream(orderFile));
694 StringWriter writer = new StringWriter();
695 StreamResult target = new StreamResult(writer);
696 xformer.transform(source, target);
697 result= writer.toString();
698 } catch (TransformerConfigurationException e) {
699 logger.severe("Failed transform " + e);
700 } catch (FileNotFoundException e) {
701 logger.severe("Failed transform, file not found " + e);
702 } catch (UnknownHostException e) {
703 logger.severe("Failed transform, unknown host " + e);
704 } catch(TransformerException e) {
705 SourceLocator locator = e.getLocator();
706 int col = locator.getColumnNumber();
707 int line = locator.getLineNumber();
708 String publicId = locator.getPublicId();
709 String systemId = locator.getSystemId();
710 logger.severe("Transform error " + e + ", col " + col + ", line " +
711 line + ", publicId " + publicId + ", systemId " + systemId);
712 }
713
714 return result;
715 }
716 }