1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor.recrawl;
24
25 import java.io.BufferedReader;
26 import java.io.File;
27 import java.io.IOException;
28 import java.io.UnsupportedEncodingException;
29 import java.net.MalformedURLException;
30 import java.net.URL;
31 import java.util.Iterator;
32 import java.util.Map.Entry;
33 import java.util.logging.ConsoleHandler;
34 import java.util.logging.Handler;
35 import java.util.logging.Level;
36 import java.util.logging.Logger;
37
38 import org.apache.commons.codec.binary.Base64;
39 import org.apache.commons.io.IOUtils;
40 import org.archive.crawler.datamodel.CrawlURI;
41 import org.archive.crawler.framework.Processor;
42 import org.archive.crawler.io.CrawlerJournal;
43 import org.archive.util.FileUtils;
44 import org.archive.util.IoUtils;
45 import org.archive.util.OneLineSimpleLogger;
46 import org.archive.util.SURT;
47 import org.archive.util.bdbje.EnhancedEnvironment;
48 import org.archive.util.iterator.LineReadingIterator;
49
50 import st.ata.util.AList;
51
52 import com.sleepycat.bind.serial.SerialBinding;
53 import com.sleepycat.bind.serial.StoredClassCatalog;
54 import com.sleepycat.bind.tuple.StringBinding;
55 import com.sleepycat.collections.StoredIterator;
56 import com.sleepycat.collections.StoredSortedMap;
57 import com.sleepycat.je.Database;
58 import com.sleepycat.je.DatabaseConfig;
59 import com.sleepycat.je.DatabaseException;
60 import com.sleepycat.je.EnvironmentConfig;
61
62 /***
63 * Superclass for Processors which utilize BDB-JE for URI state
64 * (including most notably history) persistence.
65 *
66 * @author gojomo
67 */
68 public abstract class PersistProcessor extends Processor {
69
70 private static final long serialVersionUID = 1L;
71
72 private static final Logger logger =
73 Logger.getLogger(PersistProcessor.class.getName());
74
75 /*** name of history Database */
76 public static final String URI_HISTORY_DBNAME = "uri_history";
77
78 /***
79 * @return DatabaseConfig for history Database
80 */
81 protected static DatabaseConfig historyDatabaseConfig() {
82 DatabaseConfig dbConfig = new DatabaseConfig();
83 dbConfig.setTransactional(false);
84 dbConfig.setAllowCreate(true);
85 dbConfig.setDeferredWrite(true);
86 return dbConfig;
87 }
88
89 /***
90 * Usual constructor
91 *
92 * @param name
93 * @param string
94 */
95 public PersistProcessor(String name, String string) {
96 super(name,string);
97 }
98
99 /***
100 * Return a preferred String key for persisting the given CrawlURI's
101 * AList state.
102 *
103 * @param curi CrawlURI
104 * @return String key
105 */
106 public String persistKeyFor(CrawlURI curi) {
107
108 return SURT.fromURI(curi.getUURI().toString(),true);
109 }
110
111 /***
112 * Whether the current CrawlURI's state should be persisted (to log or
113 * direct to database)
114 *
115 * @param curi CrawlURI
116 * @return true if state should be stored; false to skip persistence
117 */
118 protected boolean shouldStore(CrawlURI curi) {
119
120 return curi.isSuccess();
121 }
122
123 /***
124 * Whether the current CrawlURI's state should be loaded
125 *
126 * @param curi CrawlURI
127 * @return true if state should be loaded; false to skip loading
128 */
129 protected boolean shouldLoad(CrawlURI curi) {
130
131 return true;
132 }
133
134 /***
135 * Copies entries from an existing environment db to a new one. If
136 * historyMap is not provided, only logs the entries that would have been
137 * copied.
138 *
139 * @param sourceDir existing environment database directory
140 * @param historyMap new environment db (or null for a dry run)
141 * @return number of records
142 * @throws DatabaseException
143 */
144 private static int copyPersistEnv(File sourceDir, StoredSortedMap<String,AList> historyMap)
145 throws DatabaseException {
146 int count = 0;
147
148
149 EnhancedEnvironment sourceEnv = setupEnvironment(sourceDir);
150 StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
151 Database sourceHistoryDB = sourceEnv.openDatabase(
152 null, URI_HISTORY_DBNAME, historyDatabaseConfig());
153 StoredSortedMap<String,AList> sourceHistoryMap = new StoredSortedMap<String,AList>(sourceHistoryDB,
154 new StringBinding(), new SerialBinding<AList>(sourceClassCatalog,
155 AList.class), true);
156 Iterator<Entry<String,AList>> iter = sourceHistoryMap.entrySet().iterator();
157 while (iter.hasNext()) {
158 Entry<String,AList> item = iter.next();
159 logger.fine(item.getKey() + " " + item.getValue().toPrettyString());
160 if (historyMap != null) {
161 historyMap.put(item.getKey(), item.getValue());
162 }
163 count++;
164 }
165 StoredIterator.close(iter);
166 sourceHistoryDB.close();
167 sourceEnv.close();
168
169 return count;
170 }
171
172 /***
173 * Populates an environment db from a persist log. If historyMap is
174 * not provided, only logs the entries that would have been populated.
175 *
176 * @param persistLogReader
177 * persist log
178 * @param historyMap
179 * new environment db (or null for a dry run)
180 * @return number of records
181 * @throws UnsupportedEncodingException
182 * @throws DatabaseException
183 */
184 private static int populatePersistEnvFromLog(BufferedReader persistLogReader, StoredSortedMap<String,AList> historyMap)
185 throws UnsupportedEncodingException, DatabaseException {
186 int count = 0;
187
188 Iterator<String> iter = new LineReadingIterator(persistLogReader);
189 while (iter.hasNext()) {
190 String line = iter.next();
191 if (line.length() == 0) {
192 continue;
193 }
194 String[] splits = line.split(" ");
195 if (splits.length != 2) {
196 logger.severe("bad line: " + line);
197 continue;
198 }
199 try {
200 AList alist = (AList) IoUtils.deserializeFromByteArray(Base64.decodeBase64(splits[1].getBytes("UTF8")));
201 logger.fine(splits[0] + " " + alist.toPrettyString());
202 if (historyMap != null) {
203 historyMap.put(splits[0], alist);
204 }
205 } catch (RuntimeException e) {
206 logger.log(Level.SEVERE, "problem with line: " + line, e);
207 }
208 count++;
209 }
210 IOUtils.closeQuietly(persistLogReader);
211
212 return count;
213 }
214
215 /***
216 * Populates a new environment db from an old environment db or a persist
217 * log. If path to new environment is not provided, only logs the entries
218 * that would have been populated.
219 *
220 * @param sourcePath
221 * source of old entries: can be a path to an existing
222 * environment db, or a URL or path to a persist log
223 * @param envFile
224 * path to new environment db (or null for a dry run)
225 * @return number of records
226 * @throws DatabaseException
227 * @throws IOException
228 */
229 public static int populatePersistEnv(String sourcePath, File envFile)
230 throws DatabaseException, IOException {
231 int count = 0;
232 StoredSortedMap<String,AList> historyMap = null;
233 EnhancedEnvironment targetEnv = null;
234 StoredClassCatalog classCatalog = null;
235 Database historyDB = null;
236
237 if (envFile != null) {
238
239 if (!envFile.exists()) {
240 envFile.mkdirs();
241 }
242 targetEnv = setupEnvironment(envFile);
243 classCatalog = targetEnv.getClassCatalog();
244 historyDB = targetEnv.openDatabase(null, URI_HISTORY_DBNAME,
245 historyDatabaseConfig());
246 historyMap = new StoredSortedMap<String,AList>(historyDB,
247 new StringBinding(), new SerialBinding<AList>(classCatalog,
248 AList.class), true);
249 }
250
251 try {
252 count = copyPersistSourceToHistoryMap(null, sourcePath, historyMap);
253 } finally {
254
255
256 if (envFile != null) {
257 logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile);
258 historyDB.sync();
259 historyDB.close();
260 targetEnv.close();
261 } else {
262 logger.info(count + " records found in " + sourcePath);
263 }
264 }
265
266 return count;
267 }
268
269 /***
270 * Populates a given StoredSortedMap (history map) from an old
271 * environment db or a persist log. If a map is not provided, only
272 * logs the entries that would have been populated.
273 *
274 * @param sourcePath
275 * source of old entries: can be a path to an existing
276 * environment db, or a URL or path to a persist log
277 * @param historyMap
278 * map to populate (or null for a dry run)
279 * @return number of records
280 * @throws DatabaseException
281 * @throws IOException
282 */
283 public static int copyPersistSourceToHistoryMap(File context,
284 String sourcePath,
285 StoredSortedMap<String, AList> historyMap)
286 throws DatabaseException, IOException, MalformedURLException,
287 UnsupportedEncodingException {
288 int count;
289
290 File sourceFile = FileUtils.maybeRelative(context, sourcePath);
291 if (sourceFile.isDirectory()) {
292 count = copyPersistEnv(sourceFile, historyMap);
293 } else {
294 BufferedReader persistLogReader = null;
295 if (sourceFile.isFile()) {
296 persistLogReader = CrawlerJournal.getBufferedReader(sourceFile);
297 } else {
298 URL sourceUrl = new URL(sourcePath);
299 persistLogReader = CrawlerJournal.getBufferedReader(sourceUrl);
300 }
301 count = populatePersistEnvFromLog(persistLogReader, historyMap);
302 }
303 return count;
304 }
305
306 /***
307 * Utility main for importing a log into a BDB-JE environment or moving a
308 * database between environments (2 arguments), or simply dumping a log
309 * to stderr in a more readable format (1 argument).
310 *
311 * @param args command-line arguments
312 * @throws DatabaseException
313 * @throws IOException
314 */
315 public static void main(String[] args) throws DatabaseException, IOException {
316 Handler handler = new ConsoleHandler();
317 handler.setLevel(Level.ALL);
318 handler.setFormatter(new OneLineSimpleLogger());
319 logger.addHandler(handler);
320 logger.setUseParentHandlers(false);
321
322 if (args.length == 2) {
323 logger.setLevel(Level.INFO);
324 populatePersistEnv(args[0], new File(args[1]));
325 } else if (args.length == 1) {
326 logger.setLevel(Level.FINE);
327 populatePersistEnv(args[0], null);
328 } else {
329 System.out.println("Arguments: ");
330 System.out.println(" source [target]");
331 System.out.println(
332 "...where source is either a txtser log file or BDB env dir");
333 System.out.println(
334 "and target, if present, is a BDB env dir. ");
335 return;
336 }
337 }
338
339 private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException {
340 EnvironmentConfig envConfig = new EnvironmentConfig();
341 envConfig.setAllowCreate(true);
342 return new EnhancedEnvironment(env, envConfig);
343 }
344 }