1   /* PersistLogProcessor.java
2    * 
3    * Created on Feb 18, 2005
4    *
5    * Copyright (C) 2007 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor.recrawl;
24  
25  import java.io.File;
26  import java.io.IOException;
27  
28  import org.apache.commons.codec.binary.Base64;
29  import org.archive.crawler.datamodel.CrawlURI;
30  import org.archive.crawler.event.CrawlStatusListener;
31  import org.archive.crawler.io.CrawlerJournal;
32  import org.archive.crawler.settings.SimpleType;
33  import org.archive.util.FileUtils;
34  import org.archive.util.IoUtils;
35  
36  
37  
38  /***
39   * Log CrawlURI attributes from latest fetch for consultation by a later 
40   * recrawl. Log must be imported into alternate data structure in order
41   * to be consulted. 
42   * 
43   * @author gojomo
44   * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
45   */
46  public class PersistLogProcessor extends PersistProcessor implements CrawlStatusListener {
47      private static final long serialVersionUID = 1678691994065439346L;
48      
49      protected CrawlerJournal log;
50  
51      /*** setting for log filename */
52      public static final String ATTR_LOG_FILENAME = "log-filename";
53      /*** default log filename */ 
54      public static final String DEFAULT_LOG_FILENAME = "persistlog.txtser.gz";
55      
56      /***
57       * Usual constructor
58       * 
59       * @param name
60       */
61      public PersistLogProcessor(String name) {
62          super(name, "PersistLogProcessor. Logs CrawlURI attributes " +
63                  "from latest fetch for consultation by a later recrawl.");
64          
65          addElementToDefinition(new SimpleType(ATTR_LOG_FILENAME,
66                  "Filename to which to log URI persistence information. " +
67                  "Interpreted relative to job logs directory. " +
68                  "Default is 'persistlog.txtser.gz'. ", 
69                  DEFAULT_LOG_FILENAME));
70      }
71  
72      @Override
73      protected void initialTasks() {
74          // Add this class to crawl state listeners to note checkpoints
75          getController().addCrawlStatusListener(this);
76          try {
77              File logFile = FileUtils.maybeRelative(getController().getLogsDir(),
78                      (String) getUncheckedAttribute(null, ATTR_LOG_FILENAME));
79              log = new CrawlerJournal(logFile);
80          } catch (IOException e) {
81              throw new RuntimeException(e);
82          }
83      }
84      
85      @Override
86      protected void finalTasks() {
87          log.close();
88      }
89  
90      @Override
91      protected void innerProcess(CrawlURI curi) {
92          if(shouldStore(curi)) {
93              log.writeLine(persistKeyFor(curi), " ", new String(Base64.encodeBase64(IoUtils
94                      .serializeToByteArray(curi.getPersistentAList()))));      
95          }
96      }
97  
98      public void crawlCheckpoint(File checkpointDir) throws Exception {
99          // rotate log
100         log.checkpoint(checkpointDir);
101     }
102 
103     public void crawlEnded(String sExitMessage) {
104         // ignored
105         
106     }
107 
108     public void crawlEnding(String sExitMessage) {
109         // ignored
110         
111     }
112 
113     public void crawlPaused(String statusMessage) {
114         // ignored
115         
116     }
117 
118     public void crawlPausing(String statusMessage) {
119         // ignored
120         
121     }
122 
123     public void crawlResuming(String statusMessage) {
124         // ignored
125         
126     }
127 
128     public void crawlStarted(String message) {
129         // ignored
130     }
131 }