1   /* XMLSettingsHandler
2    *
3    * $Id: XMLSettingsHandler.java 5864 2008-07-10 21:50:48Z gojomo $
4    *
5    * Created on Dec 18, 2003
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings;
26  
27  import java.io.BufferedInputStream;
28  import java.io.BufferedOutputStream;
29  import java.io.File;
30  import java.io.FileInputStream;
31  import java.io.FileNotFoundException;
32  import java.io.FileOutputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.util.ArrayList;
36  import java.util.Collection;
37  import java.util.List;
38  import java.util.TreeSet;
39  import java.util.logging.Logger;
40  
41  import javax.management.Attribute;
42  import javax.management.AttributeNotFoundException;
43  import javax.management.InvalidAttributeValueException;
44  import javax.management.MBeanAttributeInfo;
45  import javax.management.MBeanException;
46  import javax.management.MBeanInfo;
47  import javax.management.ReflectionException;
48  import javax.xml.parsers.FactoryConfigurationError;
49  import javax.xml.parsers.ParserConfigurationException;
50  import javax.xml.parsers.SAXParserFactory;
51  import javax.xml.transform.Source;
52  import javax.xml.transform.Transformer;
53  import javax.xml.transform.TransformerFactory;
54  import javax.xml.transform.stream.StreamResult;
55  
56  import org.apache.commons.io.IOUtils;
57  import org.archive.crawler.datamodel.CrawlOrder;
58  import org.archive.util.ArchiveUtils;
59  import org.archive.util.FileUtils;
60  import org.xml.sax.InputSource;
61  import org.xml.sax.SAXException;
62  import org.xml.sax.SAXParseException;
63  import org.xml.sax.XMLReader;
64  
65  /*** A SettingsHandler which uses XML files as persistent storage.
66   *
67   * @author John Erik Halse
68   */
69  public class XMLSettingsHandler extends SettingsHandler {
70      private static Logger logger =
71          Logger.getLogger(
72              "org.archive.crawler.settings.XMLSettingsHandler");
73  
74      // XML element name constants
75      protected static final String XML_SCHEMA = "heritrix_settings.xsd";
76      protected static final String XML_ROOT_ORDER = "crawl-order";
77      protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
78      protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
79      protected static final String XML_ELEMENT_CONTROLLER = "controller";
80      protected static final String XML_ELEMENT_META = "meta";
81      protected static final String XML_ELEMENT_NAME = "name";
82      protected static final String XML_ELEMENT_DESCRIPTION = "description";
83      protected static final String XML_ELEMENT_OPERATOR = "operator";
84      protected static final String XML_ELEMENT_ORGANIZATION = "organization";
85      protected static final String XML_ELEMENT_AUDIENCE = "audience";
86      protected static final String XML_ELEMENT_DATE = "date";
87      protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
88      protected static final String XML_ELEMENT_REFINEMENT = "refinement";
89      protected static final String XML_ELEMENT_REFERENCE = "reference";
90      protected static final String XML_ELEMENT_LIMITS = "limits";
91      protected static final String XML_ELEMENT_TIMESPAN = "timespan";
92      protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
93      protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
94      protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
95      protected static final String XML_ELEMENT_OBJECT = "object";
96      protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
97      protected static final String XML_ATTRIBUTE_NAME = "name";
98      protected static final String XML_ATTRIBUTE_CLASS = "class";
99      protected static final String XML_ATTRIBUTE_FROM = "from";
100     protected static final String XML_ATTRIBUTE_TO = "to";
101 
102     private File orderFile;
103     private final static String settingsFilename = "settings";
104     private final static String settingsFilenameSuffix = "xml";
105     private final static String REFINEMENT_DIR = "_refinements";
106 
107     /*** Create a new XMLSettingsHandler object.
108      *
109      * @param orderFile where the order file is located.
110      * @throws InvalidAttributeValueException
111      */
112     public XMLSettingsHandler(File orderFile)
113     throws InvalidAttributeValueException {
114         super();
115         this.orderFile = orderFile.getAbsoluteFile();
116     }
117 
118     /*** Initialize the SettingsHandler.
119      *
120      * This method builds the settings data structure and initializes it with
121      * settings from the order file given to the constructor.
122      */
123     public void initialize() {
124         super.initialize();
125     }
126 
127     /*** 
128      * Initialize the SettingsHandler from a source.
129      *
130      * This method builds the settings data structure and initializes it with
131      * settings from the order file given as a parameter. The intended use is
132      * to create a new order file based on a default (template) order file.
133      *
134      * @param source the order file to initialize from.
135      */
136     public void initialize(File source) {
137         File tmpOrderFile = orderFile;
138         orderFile = source.getAbsoluteFile();
139         this.initialize();
140         orderFile = tmpOrderFile;
141     }
142 
143     private File getSettingsDirectory() {
144         String settingsDirectoryName = null;
145         try {
146             settingsDirectoryName =
147                     (String) getOrder().getAttribute(
148                         CrawlOrder.ATTR_SETTINGS_DIRECTORY);
149         } catch (AttributeNotFoundException e) {
150             e.printStackTrace();
151         } catch (MBeanException e) {
152             e.printStackTrace();
153         } catch (ReflectionException e) {
154             e.printStackTrace();
155         }
156 
157         return getPathRelativeToWorkingDirectory(settingsDirectoryName);
158     }
159 
160     /*** Resolves the filename for a settings object into a file path.
161      *
162      * It will also create the directory structure leading to this file
163      * if it doesn't exist.
164      *
165      * @param settings the settings object to get file path for.
166      * @return the file path for this settings object.
167      */
168     protected final File settingsToFilename(CrawlerSettings settings) {
169         File file;
170 
171         if (settings.getScope() == null || settings.getScope().equals("")) {
172             if (settings.isRefinement()) {
173                 file = new File(getSettingsDirectory(), File.separatorChar
174                         + REFINEMENT_DIR + File.separatorChar
175                         + settings.getName() + '.' + settingsFilenameSuffix);
176             } else {
177                 file = orderFile;
178             }
179         } else {
180             String elements[] = settings.getScope().split("//.");
181             if (elements.length == 0) {
182                 return orderFile;
183             }
184 
185             StringBuffer path = new StringBuffer();
186             for (int i = elements.length - 1; i > 0; i--) {
187                 path.append(elements[i]);
188                 path.append(File.separatorChar);
189             }
190             path.append(elements[0]);
191 
192             if (settings.isRefinement()) {
193                 file = new File(getSettingsDirectory(), path.toString()
194                         + File.separatorChar + REFINEMENT_DIR
195                         + File.separatorChar + settings.getName() + '.'
196                         + settingsFilenameSuffix);
197             } else {
198                 file = new File(getSettingsDirectory(), path.toString()
199                         + File.separatorChar + settingsFilename + "."
200                         + settingsFilenameSuffix);
201             }
202         }
203         return file;
204     }
205 
206     public final void writeSettingsObject(CrawlerSettings settings) {
207         File filename = settingsToFilename(settings);
208         writeSettingsObject(settings, filename);
209     }
210 
211     /*** Write a CrawlerSettings object to a specified file.
212      *
213      * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
214      * except that it uses the submitted File object instead of trying to
215      * resolve where the file should be written.
216      *
217      * @param settings the settings object to be serialized.
218      * @param filename the file to which the settings object should be written.
219      */
220     public final void writeSettingsObject(
221             CrawlerSettings settings, File filename) {
222 
223         logger.fine("Writing " + filename.getAbsolutePath());
224         filename.getParentFile().mkdirs();
225 
226         FileOutputStream fos = null;
227         try {
228             long lastSaved = 0L;
229             File backup = null;
230             if (getOrder().getController() != null && filename.exists()) {
231                 // The crawler is running and file exists - make backup first.
232                 String name = filename.getName();
233                 lastSaved = settings.getLastSavedTime().getTime();
234                 name = name.substring(0, name.lastIndexOf('.')) + '_'
235                         + ArchiveUtils.get14DigitDate(lastSaved) + "."
236                         + settingsFilenameSuffix;
237                 backup = new File(filename.getParentFile(), name);
238                 FileUtils.copyFiles(filename, backup);
239             }
240 
241             fos = new FileOutputStream(filename);
242             StreamResult result =
243                 new StreamResult(
244                     new BufferedOutputStream(fos));
245             Transformer transformer =
246                 TransformerFactory.newInstance().newTransformer();
247             Source source = new CrawlSettingsSAXSource(settings);
248             transformer.transform(source, result);
249 
250             // Hack to get rid of unnesessary backupfiles.
251             // What happens is that the WUI often saves settings files
252             // several times during a settings change. This code removes the
253             // last backup file if its no more than 2 minutes old.
254             if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
255                 backup.delete();
256             }
257         } catch (Exception e) {
258             e.printStackTrace();
259         } finally {
260             IOUtils.closeQuietly(fos);
261         }
262     }
263 
264     /*** Read the CrawlerSettings object from a specific file.
265      *
266      * @param settings the settings object to be updated with data from the
267      *                 persistent storage.
268      * @param f the file to read from.
269      * @return the updated settings object or null if there was no data for this
270      *         in the persistent storage.
271      */    
272     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
273             File f) {
274         CrawlerSettings result = null;
275         try {
276             InputStream is = null;
277             if (!f.exists()) {
278                 // Perhaps the file we're looking for is on the CLASSPATH.
279                 // DON'T look on the CLASSPATH for 'settings.xml' files.  The
280                 // look for 'settings.xml' files happens frequently. Not looking
281                 // on classpath for 'settings.xml' is an optimization based on
282                 // ASSUMPTION that there will never be a 'settings.xml' saved
283                 // on classpath.
284                 if (!f.getName().startsWith(settingsFilename)) {
285                     is = XMLSettingsHandler.class.
286                         getResourceAsStream(f.getPath());
287                 }
288             } else {
289                 is = new FileInputStream(f);
290             }
291             if (is != null) {
292                 XMLReader parser = SAXParserFactory.newInstance()
293                     .newSAXParser().getXMLReader();
294                 InputStream file = new BufferedInputStream(is);
295                 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
296                 InputSource source = new InputSource(file);
297                 source.setSystemId(f.toURL().toExternalForm());
298                 parser.parse(source);
299                 result = settings;
300             }
301         } catch (SAXParseException e) {
302             logger.warning(e.getMessage() + " in '" + e.getSystemId()
303                 + "', line: " + e.getLineNumber() + ", column: "
304                 + e.getColumnNumber());
305         } catch (SAXException e) {
306             logger.warning(e.getMessage() + ": "
307                 + e.getException().getMessage());
308         } catch (ParserConfigurationException e) {
309             logger.warning(e.getMessage() + ": "
310                 + e.getCause().getMessage());
311         } catch (FactoryConfigurationError e) {
312             logger.warning(e.getMessage() + ": "
313                 + e.getException().getMessage());
314         } catch (IOException e) {
315             logger.warning("Could not access file '"
316                 + f.getAbsolutePath() + "': " + e.getMessage());
317         }
318         return result;
319     }
320 
321     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
322         File filename = settingsToFilename(settings);
323         return readSettingsObject(settings, filename);
324     }
325 
326     /*** Get the <code>File</code> object pointing to the order file.
327      *
328      * @return File object for the order file.
329      */
330     public File getOrderFile() {
331         return orderFile;
332     }
333 
334     /*** Creates a replica of the settings file structure in another directory
335      * (fully recursive, includes all per host settings). The SettingsHandler
336      * will then refer to the new files.
337      *
338      * Observe that this method should only be called after the SettingsHandler
339      * has been initialized.
340      *
341      * @param newOrderFileName where the new order file should be saved.
342      * @param newSettingsDirectory the top level directory of the per host/domain
343      *                          settings files.
344      * @throws IOException
345      */
346     public void copySettings(File newOrderFileName, String newSettingsDirectory)
347       throws IOException {
348         File oldSettingsDirectory = getSettingsDirectory();
349 
350         // Write new orderfile and point the settingshandler to it
351         orderFile = newOrderFileName;
352         try {
353             getOrder().setAttribute(
354                 new Attribute(
355                     CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
356         } catch (Exception e) {
357             throw new IOException("Could not update settings with new location: "
358                 + e.getMessage());
359         }
360         writeSettingsObject(getSettingsObject(null));
361 
362         File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
363 
364         // Copy the per host files if src and dest directories are different.
365         if (oldSettingsDirectory.compareTo(newDir) != 0) {
366             FileUtils.copyFiles(oldSettingsDirectory, newDir);
367         }
368     }
369 
370     /***
371      * Transforms a relative path so that it is relative to the location of the
372      * order file. If an absolute path is given, it will be returned unchanged.<p>
373      * The location of it's order file is always considered as the 'working'
374      * directory for any given settings.
375      * @param path A relative path to a file (or directory)
376      * @return The same path modified so that it is relative to the file level
377      *         location of the order file for the settings handler.
378      */
379     public File getPathRelativeToWorkingDirectory(String path) {
380         File f = new File(path);
381         // If path is not absolute, set f's directory
382         // relative to the path of the order file
383         if (!f.isAbsolute()) {
384             f = new File(this.getOrderFile().getParent(), path);
385         }
386         return f;
387     }
388 
389     public Collection getDomainOverrides(String rootDomain) {
390         File settingsDir = getSettingsDirectory();
391 
392         //Find the right start directory.
393         ArrayList<String> domains = new ArrayList<String>();
394         //First we deconstruct the rootDomain string
395         while(rootDomain != null && rootDomain.length()>0){
396             if(rootDomain.indexOf('.')<0){
397                 // Last level.
398                 domains.add(rootDomain);
399                 break; //We're done.
400             } else {
401                 // Got more then one level left.
402                 domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
403                 // Strip down rootDomain.
404                 rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
405             }
406         }
407         //Build up a proper path
408         //Since the domains are right to left, we start at the end of the array.
409         StringBuffer subDir = new StringBuffer();
410         for(int i=(domains.size()-1) ; i>=0 ; i--){
411             subDir.append(File.separator+domains.get(i));
412         }
413         //Then we move to the approprite directory.
414         settingsDir = new File(settingsDir.getPath()+subDir);
415         TreeSet<String> confirmedSubDomains = new TreeSet<String>();
416         if(settingsDir.exists()){
417             // Found our place! Search through it's subdirs.
418             File[] possibleSubDomains = settingsDir.listFiles();
419             for (int i = 0; i < possibleSubDomains.length; i++) {
420                 if (possibleSubDomains[i].isDirectory()
421                     && isOverride(possibleSubDomains[i])) {
422                     // Found one!
423                     confirmedSubDomains.add(possibleSubDomains[i].getName());
424                 }
425             }
426         }
427         return confirmedSubDomains;
428     }
429 
430     /***
431      * Checks if a file is a a 'per host' override or if it's a directory if it
432      * or it's subdirectories  contains a 'per host' override file.
433      * @param f The file or directory to check
434      * @return True if the file is an override or it's a directory that contains
435      *         such a file.
436      */
437     private boolean isOverride(File f){
438         if(f.isDirectory()){
439             // Have a directory, check it's contents.
440             File[] subs = f.listFiles();
441             for(int i=0 ; i < subs.length ; i++){
442                 if(isOverride(subs[i])){
443                     // Found one. Can stop looking.
444                     return true;
445                 }
446             }
447         } else if (f.getName().equals(
448                 settingsFilename + "." + settingsFilenameSuffix)) {
449             // This is an override file (or sure looks like one in any case).
450             return true;
451         }
452         // Didn't find an override.
453         return false;
454     }
455 
456     /*** Delete a settings object from persistent storage.
457      *
458      * Deletes the file represented by the submitted settings object. All empty
459      * directories that are parents to the files path are also deleted.
460      *
461      * @param settings the settings object to delete.
462      */
463     public void deleteSettingsObject(CrawlerSettings settings) {
464         super.deleteSettingsObject(settings);
465         File settingsDirectory = getSettingsDirectory();
466         File settingsFile = settingsToFilename(settings);
467 
468         if(!settingsFile.delete()) {
469             throw new RuntimeException("Could not delete: "+settingsFile);
470         }
471         settingsFile = settingsFile.getParentFile();
472         while (settingsFile.isDirectory() && settingsFile.list().length == 0
473                 && !settingsFile.equals(settingsDirectory)) {
474             if(!settingsFile.delete()) {
475                 logger.warning("Could not delete: "+settingsFile);
476             }
477             settingsFile = settingsFile.getParentFile();
478         }
479     }
480 
481     /* (non-Javadoc)
482      * @see org.archive.crawler.settings.SettingsHandler#getListOfAllFiles()
483      */
484     public List<String> getListOfAllFiles() {
485         ArrayList<String> list = new ArrayList<String>();
486         // Add CrawlOrder.
487         list.add(getOrderFile().getAbsolutePath());
488         // Iterate through the entire override hierarchy
489         if (getSettingsDirectory().exists()) {
490             recursiveFindFiles(getSettingsDirectory(),list);
491         }
492         // Get files used by settings modules.
493         recursiveFindSecondaryFiles(getOrder(),list);
494         return list;
495     }
496 
497     /***
498      * Add any files being used by any of the Modules making up the settings to
499      * the list.
500      *
501      * @param mbean A ModuleType to interrogate for files. Any child modules
502      *           will be recursively interrogated.
503      * @param list The list to add found files to.
504      */
505     private void recursiveFindSecondaryFiles(ComplexType mbean, 
506             ArrayList<String> list) {
507         MBeanInfo info = mbean.getMBeanInfo();
508         MBeanAttributeInfo[] a = info.getAttributes();
509         // Interrogate the current module
510         if(mbean instanceof ModuleType){
511             ((ModuleType)mbean).listUsedFiles(list);
512         }
513 
514         // Recursively interrogate all sub modules that are of ModuleType
515         for(int n=0; n<a.length; n++) {
516             if(a[n] == null) {
517                 // Error null attribute.
518             } else {
519                 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
520                 Object currentAttribute;
521                 try {
522                     currentAttribute = mbean.getAttribute(att.getName());
523                     if(currentAttribute instanceof ComplexType) {
524                         recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
525                     }
526                 } catch (AttributeNotFoundException e) {
527                     // TODO Auto-generated catch block
528                     e.printStackTrace();
529                 } catch (MBeanException e) {
530                     // TODO Auto-generated catch block
531                     e.printStackTrace();
532                 } catch (ReflectionException e) {
533                     // TODO Auto-generated catch block
534                     e.printStackTrace();
535                 }
536             }
537         }
538     }
539 
540     /***
541      * Starting at the specific directory this method will iterate through all
542      * sub directories and add each file (as absolute name, with path as a
543      * string) to the provided ArrayList. Any file found under the settings
544      * directory with the proper suffix will be considered valid and added to
545      * the list.
546      * @param dir Starting directory
547      * @param list The list to add to
548      */
549     private void recursiveFindFiles(File dir, ArrayList<String> list){
550         File[] subs = dir.listFiles();
551         if (subs != null) {
552             for(int i=0 ; i < subs.length ; i++){
553                 if(subs[i].isDirectory()){
554                     recursiveFindFiles(subs[i],list);
555                 } else {
556                     if(subs[i].getName().endsWith(settingsFilenameSuffix)){
557                         // Add it to list
558                         list.add(subs[i].getAbsolutePath());
559                     }
560                 }
561             }
562         }
563     }
564 }