1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor.recrawl;
24
25
26 import java.io.IOException;
27 import java.util.Iterator;
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.apache.commons.lang.StringUtils;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.settings.SimpleType;
34 import org.archive.crawler.settings.Type;
35
36 import com.sleepycat.collections.StoredSortedMap;
37 import com.sleepycat.je.DatabaseException;
38
39 import st.ata.util.AList;
40
41 /***
42 * Store CrawlURI attributes from latest fetch to persistent storage for
43 * consultation by a later recrawl.
44 *
45 * @author gojomo
46 * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
47 */
48 public class PersistLoadProcessor extends PersistOnlineProcessor {
49 private static final long serialVersionUID = -1917169316015093131L;
50 private static final Logger logger =
51 Logger.getLogger(PersistLoadProcessor.class.getName());
52
53 /*** file (log) or directory (state/env) from which to preload history **/
54 public static final String ATTR_PRELOAD_SOURCE = "preload-source";
55
56 /***
57 * Usual constructor
58 *
59 * @param name
60 */
61 public PersistLoadProcessor(String name) {
62 super(name, "PersistLoadProcessor. Loads CrawlURI attributes " +
63 "from a previous crawl for current consultation.");
64 Type e;
65 e = addElementToDefinition(new SimpleType(ATTR_PRELOAD_SOURCE,
66 "Source for preloaded persist information. This can be " +
67 "a URL or path to a persist log, or a path to an old " +
68 "state directory.", ""));
69 e.setOverrideable(false);
70 e.setExpertSetting(false);
71 }
72
73
74
75 @Override
76 protected StoredSortedMap<String,AList> initStore() {
77 StoredSortedMap<String,AList> historyMap = super.initStore();
78
79
80 String preloadSource =
81 (String) getUncheckedAttribute(null, ATTR_PRELOAD_SOURCE);
82 if (StringUtils.isNotBlank(preloadSource)) {
83 try {
84 PersistProcessor.copyPersistSourceToHistoryMap(
85 getController().getDisk(), preloadSource, historyMap);
86 } catch (IOException ioe) {
87 logger.log(
88 Level.SEVERE,
89 "Unable to initialize persisted environment from "
90 + preloadSource + " - proceeding without persisted environment!",
91 ioe);
92 } catch(DatabaseException de) {
93 logger.log(
94 Level.SEVERE,
95 "Unable to initialize persisted environment from "
96 + preloadSource + " - proceeding without persisted environment!",
97 de);
98 }
99 }
100 return historyMap;
101 }
102
103
104
105 @Override
106 protected void innerProcess(CrawlURI curi) throws InterruptedException {
107 if(shouldLoad(curi)) {
108 AList prior = (AList) store.get(persistKeyFor(curi));
109 if(prior!=null) {
110
111 Iterator iter = prior.getKeys();
112 curi.getAList().copyKeysFrom(iter,prior);
113 }
114 }
115 }
116 }