View Javadoc

1   /* PersistProcessor.java
2    * 
3    * Created on Feb 17, 2005
4    *
5    * Copyright (C) 2007 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor.recrawl;
24  
25  import java.io.BufferedReader;
26  import java.io.File;
27  import java.io.IOException;
28  import java.io.UnsupportedEncodingException;
29  import java.net.MalformedURLException;
30  import java.net.URL;
31  import java.util.Iterator;
32  import java.util.Map.Entry;
33  import java.util.logging.ConsoleHandler;
34  import java.util.logging.Handler;
35  import java.util.logging.Level;
36  import java.util.logging.Logger;
37  
38  import org.apache.commons.codec.binary.Base64;
39  import org.apache.commons.io.IOUtils;
40  import org.archive.crawler.datamodel.CrawlURI;
41  import org.archive.crawler.framework.Processor;
42  import org.archive.crawler.io.CrawlerJournal;
43  import org.archive.util.FileUtils;
44  import org.archive.util.IoUtils;
45  import org.archive.util.OneLineSimpleLogger;
46  import org.archive.util.SURT;
47  import org.archive.util.bdbje.EnhancedEnvironment;
48  import org.archive.util.iterator.LineReadingIterator;
49  
50  import st.ata.util.AList;
51  
52  import com.sleepycat.bind.serial.SerialBinding;
53  import com.sleepycat.bind.serial.StoredClassCatalog;
54  import com.sleepycat.bind.tuple.StringBinding;
55  import com.sleepycat.collections.StoredIterator;
56  import com.sleepycat.collections.StoredSortedMap;
57  import com.sleepycat.je.Database;
58  import com.sleepycat.je.DatabaseConfig;
59  import com.sleepycat.je.DatabaseException;
60  import com.sleepycat.je.EnvironmentConfig;
61  
62  /***
63   * Superclass for Processors which utilize BDB-JE for URI state
64   * (including most notably history) persistence.
65   * 
66   * @author gojomo
67   */
68  public abstract class PersistProcessor extends Processor {
69      
70      private static final long serialVersionUID = 1L;
71  
72      private static final Logger logger =
73          Logger.getLogger(PersistProcessor.class.getName());
74  
75      /*** name of history Database */
76      public static final String URI_HISTORY_DBNAME = "uri_history";
77      
78      /***
79       * @return DatabaseConfig for history Database
80       */
81      protected static DatabaseConfig historyDatabaseConfig() {
82          DatabaseConfig dbConfig = new DatabaseConfig();
83          dbConfig.setTransactional(false);
84          dbConfig.setAllowCreate(true);
85          dbConfig.setDeferredWrite(true);
86          return dbConfig;
87      }
88  
89      /***
90       * Usual constructor
91       * 
92       * @param name
93       * @param string
94       */
95      public PersistProcessor(String name, String string) {
96          super(name,string);
97      }
98  
99      /***
100      * Return a preferred String key for persisting the given CrawlURI's
101      * AList state. 
102      * 
103      * @param curi CrawlURI
104      * @return String key
105      */
106     public String persistKeyFor(CrawlURI curi) {
107         // use a case-sensitive SURT for uniqueness and sorting benefits
108         return SURT.fromURI(curi.getUURI().toString(),true);
109     }
110 
111     /***
112      * Whether the current CrawlURI's state should be persisted (to log or
113      * direct to database)
114      * 
115      * @param curi CrawlURI
116      * @return true if state should be stored; false to skip persistence
117      */
118     protected boolean shouldStore(CrawlURI curi) {
119         // TODO: don't store some codes, such as 304 unchanged?
120         return curi.isSuccess();
121     }
122 
123     /***
124      * Whether the current CrawlURI's state should be loaded
125      * 
126      * @param curi CrawlURI
127      * @return true if state should be loaded; false to skip loading
128      */
129     protected boolean shouldLoad(CrawlURI curi) {
130         // TODO: don't load some (prereqs?)
131         return true;
132     }
133 
134     /***
135      * Copies entries from an existing environment db to a new one. If
136      * historyMap is not provided, only logs the entries that would have been 
137      * copied.
138      * 
139      * @param sourceDir existing environment database directory
140      * @param historyMap new environment db (or null for a dry run)
141      * @return number of records
142      * @throws DatabaseException
143      */
144     private static int copyPersistEnv(File sourceDir, StoredSortedMap<String,AList> historyMap) 
145     throws DatabaseException {
146         int count = 0;
147 
148         // open the source env history DB, copying entries to target env
149         EnhancedEnvironment sourceEnv = setupEnvironment(sourceDir);
150         StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
151         Database sourceHistoryDB = sourceEnv.openDatabase(
152                 null, URI_HISTORY_DBNAME, historyDatabaseConfig());
153         StoredSortedMap<String,AList> sourceHistoryMap = new StoredSortedMap<String,AList>(sourceHistoryDB,
154                 new StringBinding(), new SerialBinding<AList>(sourceClassCatalog,
155                         AList.class), true);
156         Iterator<Entry<String,AList>> iter = sourceHistoryMap.entrySet().iterator();
157         while (iter.hasNext()) {
158             Entry<String,AList> item = iter.next(); 
159             logger.fine(item.getKey() + " " + item.getValue().toPrettyString());
160             if (historyMap != null) {
161                 historyMap.put(item.getKey(), item.getValue());
162             }
163             count++;
164         }
165         StoredIterator.close(iter);
166         sourceHistoryDB.close();
167         sourceEnv.close();
168         
169         return count;
170     }
171 
172     /***
173      * Populates an environment db from a persist log. If historyMap is
174      * not provided, only logs the entries that would have been populated.
175      * 
176      * @param persistLogReader
177      *            persist log
178      * @param historyMap
179      *            new environment db (or null for a dry run)
180      * @return number of records
181      * @throws UnsupportedEncodingException
182      * @throws DatabaseException
183      */
184     private static int populatePersistEnvFromLog(BufferedReader persistLogReader, StoredSortedMap<String,AList> historyMap) 
185     throws UnsupportedEncodingException, DatabaseException {
186         int count = 0;
187         
188         Iterator<String> iter = new LineReadingIterator(persistLogReader);
189         while (iter.hasNext()) {
190             String line = iter.next(); 
191             if (line.length() == 0) {
192                 continue;
193             }
194             String[] splits = line.split(" ");
195             if (splits.length != 2) {
196                 logger.severe("bad line: " + line);
197                 continue;
198             }
199             try {
200                 AList alist = (AList) IoUtils.deserializeFromByteArray(Base64.decodeBase64(splits[1].getBytes("UTF8")));
201                 logger.fine(splits[0] + " " + alist.toPrettyString());
202                 if (historyMap != null) {
203                     historyMap.put(splits[0], alist);
204                 }
205             } catch (RuntimeException e) {
206                 logger.log(Level.SEVERE, "problem with line: " + line, e);
207             }
208             count++;
209         }
210         IOUtils.closeQuietly(persistLogReader);
211         
212         return count;
213     }
214     
215     /***
216      * Populates a new environment db from an old environment db or a persist
217      * log. If path to new environment is not provided, only logs the entries 
218      * that would have been populated.
219      * 
220      * @param sourcePath
221      *            source of old entries: can be a path to an existing
222      *            environment db, or a URL or path to a persist log
223      * @param envFile
224      *            path to new environment db (or null for a dry run)
225      * @return number of records
226      * @throws DatabaseException
227      * @throws IOException
228      */
229     public static int populatePersistEnv(String sourcePath, File envFile)
230         throws DatabaseException, IOException {
231         int count = 0;
232         StoredSortedMap<String,AList> historyMap = null;
233         EnhancedEnvironment targetEnv = null;
234         StoredClassCatalog classCatalog = null;
235         Database historyDB = null;
236 
237         if (envFile != null) {
238             // set up target environment
239             if (!envFile.exists()) {
240                 envFile.mkdirs();
241             }
242             targetEnv = setupEnvironment(envFile);
243             classCatalog = targetEnv.getClassCatalog();
244             historyDB = targetEnv.openDatabase(null, URI_HISTORY_DBNAME, 
245                     historyDatabaseConfig());
246             historyMap = new StoredSortedMap<String,AList>(historyDB, 
247                     new StringBinding(), new SerialBinding<AList>(classCatalog,
248                         AList.class), true);
249         }
250 
251         try {
252             count = copyPersistSourceToHistoryMap(null, sourcePath, historyMap);
253         } finally {
254             // in finally block so that we unlock the target env even if we
255             // failed to populate it
256             if (envFile != null) {
257                 logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile);
258                 historyDB.sync();
259                 historyDB.close();
260                 targetEnv.close();
261             } else {
262                 logger.info(count + " records found in " + sourcePath);
263             }
264         }
265 
266         return count;
267     }
268 
269     /***
270      * Populates a given StoredSortedMap (history map) from an old 
271      * environment db or a persist log. If a map is not provided, only 
272      * logs the entries that would have been populated.
273      * 
274      * @param sourcePath
275      *            source of old entries: can be a path to an existing
276      *            environment db, or a URL or path to a persist log
277      * @param historyMap
278      *            map to populate (or null for a dry run)
279      * @return number of records
280      * @throws DatabaseException
281      * @throws IOException
282      */
283     public static int copyPersistSourceToHistoryMap(File context,
284             String sourcePath,
285             StoredSortedMap<String, AList> historyMap)
286             throws DatabaseException, IOException, MalformedURLException,
287             UnsupportedEncodingException {
288         int count;
289         // delegate depending on the source
290         File sourceFile = FileUtils.maybeRelative(context, sourcePath);
291         if (sourceFile.isDirectory()) {
292             count = copyPersistEnv(sourceFile, historyMap);
293         } else {
294             BufferedReader persistLogReader = null;
295             if (sourceFile.isFile()) {
296                 persistLogReader = CrawlerJournal.getBufferedReader(sourceFile);
297             } else {
298                 URL sourceUrl = new URL(sourcePath);
299                 persistLogReader = CrawlerJournal.getBufferedReader(sourceUrl);
300             }
301             count = populatePersistEnvFromLog(persistLogReader, historyMap);
302         }
303         return count;
304     }
305 
306     /***
307      * Utility main for importing a log into a BDB-JE environment or moving a
308      * database between environments (2 arguments), or simply dumping a log
309      * to stderr in a more readable format (1 argument). 
310      * 
311      * @param args command-line arguments
312      * @throws DatabaseException
313      * @throws IOException
314      */
315     public static void main(String[] args) throws DatabaseException, IOException {
316         Handler handler = new ConsoleHandler();
317         handler.setLevel(Level.ALL);
318         handler.setFormatter(new OneLineSimpleLogger());
319         logger.addHandler(handler);
320         logger.setUseParentHandlers(false);
321 
322         if (args.length == 2) {
323             logger.setLevel(Level.INFO);
324             populatePersistEnv(args[0], new File(args[1]));
325         } else if (args.length == 1) {
326             logger.setLevel(Level.FINE);
327             populatePersistEnv(args[0], null);
328         } else {
329             System.out.println("Arguments: ");
330             System.out.println("    source [target]");
331             System.out.println(
332                 "...where source is either a txtser log file or BDB env dir");
333             System.out.println(
334                 "and target, if present, is a BDB env dir. ");
335             return;
336         }
337     }
338 
339     private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException {
340         EnvironmentConfig envConfig = new EnvironmentConfig();
341         envConfig.setAllowCreate(true);
342         return new EnhancedEnvironment(env, envConfig);
343     }
344 }