1   /* PersistLoadProcessor.java
2    * 
3    * Created on Feb 13, 2005
4    *
5    * Copyright (C) 2007 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor.recrawl;
24  
25  
26  import java.io.IOException;
27  import java.util.Iterator;
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import org.apache.commons.lang.StringUtils;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.settings.SimpleType;
34  import org.archive.crawler.settings.Type;
35  
36  import com.sleepycat.collections.StoredSortedMap;
37  import com.sleepycat.je.DatabaseException;
38  
39  import st.ata.util.AList;
40  
41  /***
42   * Store CrawlURI attributes from latest fetch to persistent storage for
43   * consultation by a later recrawl. 
44   * 
45   * @author gojomo
46   * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
47   */
48  public class PersistLoadProcessor extends PersistOnlineProcessor {
49      private static final long serialVersionUID = -1917169316015093131L;
50      private static final Logger logger =
51          Logger.getLogger(PersistLoadProcessor.class.getName());
52      
53      /*** file (log) or directory (state/env) from which to preload history **/
54      public static final String ATTR_PRELOAD_SOURCE = "preload-source";
55  
56      /***
57       * Usual constructor
58       * 
59       * @param name
60       */
61      public PersistLoadProcessor(String name) {
62          super(name, "PersistLoadProcessor. Loads CrawlURI attributes " +
63                  "from a previous crawl for current consultation.");
64          Type e;
65          e = addElementToDefinition(new SimpleType(ATTR_PRELOAD_SOURCE,
66                  "Source for preloaded persist information. This can be " +
67                  "a URL or path to a persist log, or a path to an old " +
68                  "state directory.", ""));
69          e.setOverrideable(false);
70          e.setExpertSetting(false);
71      }
72  
73      
74      
75      @Override
76      protected StoredSortedMap<String,AList> initStore() {
77          StoredSortedMap<String,AList> historyMap = super.initStore();
78          
79          // Preload, if a 'preload-source' file-path/URI/dir-path specified
80          String preloadSource = 
81              (String) getUncheckedAttribute(null, ATTR_PRELOAD_SOURCE);
82          if (StringUtils.isNotBlank(preloadSource)) {
83              try {
84                  PersistProcessor.copyPersistSourceToHistoryMap(
85                          getController().getDisk(), preloadSource, historyMap);
86              } catch (IOException ioe) {
87                  logger.log(
88                          Level.SEVERE, 
89                          "Unable to initialize persisted environment from "
90                              + preloadSource + " - proceeding without persisted environment!",
91                          ioe);
92              } catch(DatabaseException de) {
93                  logger.log(
94                          Level.SEVERE, 
95                          "Unable to initialize persisted environment from "
96                              + preloadSource + " - proceeding without persisted environment!",
97                          de);
98              }
99          }
100         return historyMap;
101     }
102 
103 
104 
105     @Override
106     protected void innerProcess(CrawlURI curi) throws InterruptedException {
107         if(shouldLoad(curi)) {
108             AList prior = (AList) store.get(persistKeyFor(curi));
109             if(prior!=null) {
110                 // merge in keys
111                 Iterator iter = prior.getKeys();
112                 curi.getAList().copyKeysFrom(iter,prior);
113             }
114         }
115     }
116 }