1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor.recrawl;
24
25 import java.io.File;
26 import java.io.IOException;
27
28 import org.apache.commons.codec.binary.Base64;
29 import org.archive.crawler.datamodel.CrawlURI;
30 import org.archive.crawler.event.CrawlStatusListener;
31 import org.archive.crawler.io.CrawlerJournal;
32 import org.archive.crawler.settings.SimpleType;
33 import org.archive.util.FileUtils;
34 import org.archive.util.IoUtils;
35
36
37
38 /***
39 * Log CrawlURI attributes from latest fetch for consultation by a later
40 * recrawl. Log must be imported into alternate data structure in order
41 * to be consulted.
42 *
43 * @author gojomo
44 * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
45 */
46 public class PersistLogProcessor extends PersistProcessor implements CrawlStatusListener {
47 private static final long serialVersionUID = 1678691994065439346L;
48
49 protected CrawlerJournal log;
50
51 /*** setting for log filename */
52 public static final String ATTR_LOG_FILENAME = "log-filename";
53 /*** default log filename */
54 public static final String DEFAULT_LOG_FILENAME = "persistlog.txtser.gz";
55
56 /***
57 * Usual constructor
58 *
59 * @param name
60 */
61 public PersistLogProcessor(String name) {
62 super(name, "PersistLogProcessor. Logs CrawlURI attributes " +
63 "from latest fetch for consultation by a later recrawl.");
64
65 addElementToDefinition(new SimpleType(ATTR_LOG_FILENAME,
66 "Filename to which to log URI persistence information. " +
67 "Interpreted relative to job logs directory. " +
68 "Default is 'persistlog.txtser.gz'. ",
69 DEFAULT_LOG_FILENAME));
70 }
71
72 @Override
73 protected void initialTasks() {
74
75 getController().addCrawlStatusListener(this);
76 try {
77 File logFile = FileUtils.maybeRelative(getController().getLogsDir(),
78 (String) getUncheckedAttribute(null, ATTR_LOG_FILENAME));
79 log = new CrawlerJournal(logFile);
80 } catch (IOException e) {
81 throw new RuntimeException(e);
82 }
83 }
84
85 @Override
86 protected void finalTasks() {
87 log.close();
88 }
89
90 @Override
91 protected void innerProcess(CrawlURI curi) {
92 if(shouldStore(curi)) {
93 log.writeLine(persistKeyFor(curi), " ", new String(Base64.encodeBase64(IoUtils
94 .serializeToByteArray(curi.getPersistentAList()))));
95 }
96 }
97
98 public void crawlCheckpoint(File checkpointDir) throws Exception {
99
100 log.checkpoint(checkpointDir);
101 }
102
103 public void crawlEnded(String sExitMessage) {
104
105
106 }
107
108 public void crawlEnding(String sExitMessage) {
109
110
111 }
112
113 public void crawlPaused(String statusMessage) {
114
115
116 }
117
118 public void crawlPausing(String statusMessage) {
119
120
121 }
122
123 public void crawlResuming(String statusMessage) {
124
125
126 }
127
128 public void crawlStarted(String message) {
129
130 }
131 }