1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.settings;
26
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
64
65 /*** A SettingsHandler which uses XML files as persistent storage.
66 *
67 * @author John Erik Halse
68 */
69 public class XMLSettingsHandler extends SettingsHandler {
70 private static Logger logger =
71 Logger.getLogger(
72 "org.archive.crawler.settings.XMLSettingsHandler");
73
74
75 protected static final String XML_SCHEMA = "heritrix_settings.xsd";
76 protected static final String XML_ROOT_ORDER = "crawl-order";
77 protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
78 protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
79 protected static final String XML_ELEMENT_CONTROLLER = "controller";
80 protected static final String XML_ELEMENT_META = "meta";
81 protected static final String XML_ELEMENT_NAME = "name";
82 protected static final String XML_ELEMENT_DESCRIPTION = "description";
83 protected static final String XML_ELEMENT_OPERATOR = "operator";
84 protected static final String XML_ELEMENT_ORGANIZATION = "organization";
85 protected static final String XML_ELEMENT_AUDIENCE = "audience";
86 protected static final String XML_ELEMENT_DATE = "date";
87 protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
88 protected static final String XML_ELEMENT_REFINEMENT = "refinement";
89 protected static final String XML_ELEMENT_REFERENCE = "reference";
90 protected static final String XML_ELEMENT_LIMITS = "limits";
91 protected static final String XML_ELEMENT_TIMESPAN = "timespan";
92 protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
93 protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
94 protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
95 protected static final String XML_ELEMENT_OBJECT = "object";
96 protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
97 protected static final String XML_ATTRIBUTE_NAME = "name";
98 protected static final String XML_ATTRIBUTE_CLASS = "class";
99 protected static final String XML_ATTRIBUTE_FROM = "from";
100 protected static final String XML_ATTRIBUTE_TO = "to";
101
102 private File orderFile;
103 private final static String settingsFilename = "settings";
104 private final static String settingsFilenameSuffix = "xml";
105 private final static String REFINEMENT_DIR = "_refinements";
106
107 /*** Create a new XMLSettingsHandler object.
108 *
109 * @param orderFile where the order file is located.
110 * @throws InvalidAttributeValueException
111 */
112 public XMLSettingsHandler(File orderFile)
113 throws InvalidAttributeValueException {
114 super();
115 this.orderFile = orderFile.getAbsoluteFile();
116 }
117
118 /*** Initialize the SettingsHandler.
119 *
120 * This method builds the settings data structure and initializes it with
121 * settings from the order file given to the constructor.
122 */
123 public void initialize() {
124 super.initialize();
125 }
126
127 /***
128 * Initialize the SettingsHandler from a source.
129 *
130 * This method builds the settings data structure and initializes it with
131 * settings from the order file given as a parameter. The intended use is
132 * to create a new order file based on a default (template) order file.
133 *
134 * @param source the order file to initialize from.
135 */
136 public void initialize(File source) {
137 File tmpOrderFile = orderFile;
138 orderFile = source.getAbsoluteFile();
139 this.initialize();
140 orderFile = tmpOrderFile;
141 }
142
143 private File getSettingsDirectory() {
144 String settingsDirectoryName = null;
145 try {
146 settingsDirectoryName =
147 (String) getOrder().getAttribute(
148 CrawlOrder.ATTR_SETTINGS_DIRECTORY);
149 } catch (AttributeNotFoundException e) {
150 e.printStackTrace();
151 } catch (MBeanException e) {
152 e.printStackTrace();
153 } catch (ReflectionException e) {
154 e.printStackTrace();
155 }
156
157 return getPathRelativeToWorkingDirectory(settingsDirectoryName);
158 }
159
160 /*** Resolves the filename for a settings object into a file path.
161 *
162 * It will also create the directory structure leading to this file
163 * if it doesn't exist.
164 *
165 * @param settings the settings object to get file path for.
166 * @return the file path for this settings object.
167 */
168 protected final File settingsToFilename(CrawlerSettings settings) {
169 File file;
170
171 if (settings.getScope() == null || settings.getScope().equals("")) {
172 if (settings.isRefinement()) {
173 file = new File(getSettingsDirectory(), File.separatorChar
174 + REFINEMENT_DIR + File.separatorChar
175 + settings.getName() + '.' + settingsFilenameSuffix);
176 } else {
177 file = orderFile;
178 }
179 } else {
180 String elements[] = settings.getScope().split("//.");
181 if (elements.length == 0) {
182 return orderFile;
183 }
184
185 StringBuffer path = new StringBuffer();
186 for (int i = elements.length - 1; i > 0; i--) {
187 path.append(elements[i]);
188 path.append(File.separatorChar);
189 }
190 path.append(elements[0]);
191
192 if (settings.isRefinement()) {
193 file = new File(getSettingsDirectory(), path.toString()
194 + File.separatorChar + REFINEMENT_DIR
195 + File.separatorChar + settings.getName() + '.'
196 + settingsFilenameSuffix);
197 } else {
198 file = new File(getSettingsDirectory(), path.toString()
199 + File.separatorChar + settingsFilename + "."
200 + settingsFilenameSuffix);
201 }
202 }
203 return file;
204 }
205
206 public final void writeSettingsObject(CrawlerSettings settings) {
207 File filename = settingsToFilename(settings);
208 writeSettingsObject(settings, filename);
209 }
210
211 /*** Write a CrawlerSettings object to a specified file.
212 *
213 * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
214 * except that it uses the submitted File object instead of trying to
215 * resolve where the file should be written.
216 *
217 * @param settings the settings object to be serialized.
218 * @param filename the file to which the settings object should be written.
219 */
220 public final void writeSettingsObject(
221 CrawlerSettings settings, File filename) {
222
223 logger.fine("Writing " + filename.getAbsolutePath());
224 filename.getParentFile().mkdirs();
225
226 FileOutputStream fos = null;
227 try {
228 long lastSaved = 0L;
229 File backup = null;
230 if (getOrder().getController() != null && filename.exists()) {
231
232 String name = filename.getName();
233 lastSaved = settings.getLastSavedTime().getTime();
234 name = name.substring(0, name.lastIndexOf('.')) + '_'
235 + ArchiveUtils.get14DigitDate(lastSaved) + "."
236 + settingsFilenameSuffix;
237 backup = new File(filename.getParentFile(), name);
238 FileUtils.copyFiles(filename, backup);
239 }
240
241 fos = new FileOutputStream(filename);
242 StreamResult result =
243 new StreamResult(
244 new BufferedOutputStream(fos));
245 Transformer transformer =
246 TransformerFactory.newInstance().newTransformer();
247 Source source = new CrawlSettingsSAXSource(settings);
248 transformer.transform(source, result);
249
250
251
252
253
254 if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
255 backup.delete();
256 }
257 } catch (Exception e) {
258 e.printStackTrace();
259 } finally {
260 IOUtils.closeQuietly(fos);
261 }
262 }
263
264 /*** Read the CrawlerSettings object from a specific file.
265 *
266 * @param settings the settings object to be updated with data from the
267 * persistent storage.
268 * @param f the file to read from.
269 * @return the updated settings object or null if there was no data for this
270 * in the persistent storage.
271 */
272 protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
273 File f) {
274 CrawlerSettings result = null;
275 try {
276 InputStream is = null;
277 if (!f.exists()) {
278
279
280
281
282
283
284 if (!f.getName().startsWith(settingsFilename)) {
285 is = XMLSettingsHandler.class.
286 getResourceAsStream(f.getPath());
287 }
288 } else {
289 is = new FileInputStream(f);
290 }
291 if (is != null) {
292 XMLReader parser = SAXParserFactory.newInstance()
293 .newSAXParser().getXMLReader();
294 InputStream file = new BufferedInputStream(is);
295 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
296 InputSource source = new InputSource(file);
297 source.setSystemId(f.toURL().toExternalForm());
298 parser.parse(source);
299 result = settings;
300 }
301 } catch (SAXParseException e) {
302 logger.warning(e.getMessage() + " in '" + e.getSystemId()
303 + "', line: " + e.getLineNumber() + ", column: "
304 + e.getColumnNumber());
305 } catch (SAXException e) {
306 logger.warning(e.getMessage() + ": "
307 + e.getException().getMessage());
308 } catch (ParserConfigurationException e) {
309 logger.warning(e.getMessage() + ": "
310 + e.getCause().getMessage());
311 } catch (FactoryConfigurationError e) {
312 logger.warning(e.getMessage() + ": "
313 + e.getException().getMessage());
314 } catch (IOException e) {
315 logger.warning("Could not access file '"
316 + f.getAbsolutePath() + "': " + e.getMessage());
317 }
318 return result;
319 }
320
321 protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
322 File filename = settingsToFilename(settings);
323 return readSettingsObject(settings, filename);
324 }
325
326 /*** Get the <code>File</code> object pointing to the order file.
327 *
328 * @return File object for the order file.
329 */
330 public File getOrderFile() {
331 return orderFile;
332 }
333
334 /*** Creates a replica of the settings file structure in another directory
335 * (fully recursive, includes all per host settings). The SettingsHandler
336 * will then refer to the new files.
337 *
338 * Observe that this method should only be called after the SettingsHandler
339 * has been initialized.
340 *
341 * @param newOrderFileName where the new order file should be saved.
342 * @param newSettingsDirectory the top level directory of the per host/domain
343 * settings files.
344 * @throws IOException
345 */
346 public void copySettings(File newOrderFileName, String newSettingsDirectory)
347 throws IOException {
348 File oldSettingsDirectory = getSettingsDirectory();
349
350
351 orderFile = newOrderFileName;
352 try {
353 getOrder().setAttribute(
354 new Attribute(
355 CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
356 } catch (Exception e) {
357 throw new IOException("Could not update settings with new location: "
358 + e.getMessage());
359 }
360 writeSettingsObject(getSettingsObject(null));
361
362 File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
363
364
365 if (oldSettingsDirectory.compareTo(newDir) != 0) {
366 FileUtils.copyFiles(oldSettingsDirectory, newDir);
367 }
368 }
369
370 /***
371 * Transforms a relative path so that it is relative to the location of the
372 * order file. If an absolute path is given, it will be returned unchanged.<p>
373 * The location of it's order file is always considered as the 'working'
374 * directory for any given settings.
375 * @param path A relative path to a file (or directory)
376 * @return The same path modified so that it is relative to the file level
377 * location of the order file for the settings handler.
378 */
379 public File getPathRelativeToWorkingDirectory(String path) {
380 File f = new File(path);
381
382
383 if (!f.isAbsolute()) {
384 f = new File(this.getOrderFile().getParent(), path);
385 }
386 return f;
387 }
388
389 public Collection getDomainOverrides(String rootDomain) {
390 File settingsDir = getSettingsDirectory();
391
392
393 ArrayList<String> domains = new ArrayList<String>();
394
395 while(rootDomain != null && rootDomain.length()>0){
396 if(rootDomain.indexOf('.')<0){
397
398 domains.add(rootDomain);
399 break;
400 } else {
401
402 domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
403
404 rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
405 }
406 }
407
408
409 StringBuffer subDir = new StringBuffer();
410 for(int i=(domains.size()-1) ; i>=0 ; i--){
411 subDir.append(File.separator+domains.get(i));
412 }
413
414 settingsDir = new File(settingsDir.getPath()+subDir);
415 TreeSet<String> confirmedSubDomains = new TreeSet<String>();
416 if(settingsDir.exists()){
417
418 File[] possibleSubDomains = settingsDir.listFiles();
419 for (int i = 0; i < possibleSubDomains.length; i++) {
420 if (possibleSubDomains[i].isDirectory()
421 && isOverride(possibleSubDomains[i])) {
422
423 confirmedSubDomains.add(possibleSubDomains[i].getName());
424 }
425 }
426 }
427 return confirmedSubDomains;
428 }
429
430 /***
431 * Checks if a file is a a 'per host' override or if it's a directory if it
432 * or it's subdirectories contains a 'per host' override file.
433 * @param f The file or directory to check
434 * @return True if the file is an override or it's a directory that contains
435 * such a file.
436 */
437 private boolean isOverride(File f){
438 if(f.isDirectory()){
439
440 File[] subs = f.listFiles();
441 for(int i=0 ; i < subs.length ; i++){
442 if(isOverride(subs[i])){
443
444 return true;
445 }
446 }
447 } else if (f.getName().equals(
448 settingsFilename + "." + settingsFilenameSuffix)) {
449
450 return true;
451 }
452
453 return false;
454 }
455
456 /*** Delete a settings object from persistent storage.
457 *
458 * Deletes the file represented by the submitted settings object. All empty
459 * directories that are parents to the files path are also deleted.
460 *
461 * @param settings the settings object to delete.
462 */
463 public void deleteSettingsObject(CrawlerSettings settings) {
464 super.deleteSettingsObject(settings);
465 File settingsDirectory = getSettingsDirectory();
466 File settingsFile = settingsToFilename(settings);
467
468 if(!settingsFile.delete()) {
469 throw new RuntimeException("Could not delete: "+settingsFile);
470 }
471 settingsFile = settingsFile.getParentFile();
472 while (settingsFile.isDirectory() && settingsFile.list().length == 0
473 && !settingsFile.equals(settingsDirectory)) {
474 if(!settingsFile.delete()) {
475 logger.warning("Could not delete: "+settingsFile);
476 }
477 settingsFile = settingsFile.getParentFile();
478 }
479 }
480
481
482
483
484 public List<String> getListOfAllFiles() {
485 ArrayList<String> list = new ArrayList<String>();
486
487 list.add(getOrderFile().getAbsolutePath());
488
489 if (getSettingsDirectory().exists()) {
490 recursiveFindFiles(getSettingsDirectory(),list);
491 }
492
493 recursiveFindSecondaryFiles(getOrder(),list);
494 return list;
495 }
496
497 /***
498 * Add any files being used by any of the Modules making up the settings to
499 * the list.
500 *
501 * @param mbean A ModuleType to interrogate for files. Any child modules
502 * will be recursively interrogated.
503 * @param list The list to add found files to.
504 */
505 private void recursiveFindSecondaryFiles(ComplexType mbean,
506 ArrayList<String> list) {
507 MBeanInfo info = mbean.getMBeanInfo();
508 MBeanAttributeInfo[] a = info.getAttributes();
509
510 if(mbean instanceof ModuleType){
511 ((ModuleType)mbean).listUsedFiles(list);
512 }
513
514
515 for(int n=0; n<a.length; n++) {
516 if(a[n] == null) {
517
518 } else {
519 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
520 Object currentAttribute;
521 try {
522 currentAttribute = mbean.getAttribute(att.getName());
523 if(currentAttribute instanceof ComplexType) {
524 recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
525 }
526 } catch (AttributeNotFoundException e) {
527
528 e.printStackTrace();
529 } catch (MBeanException e) {
530
531 e.printStackTrace();
532 } catch (ReflectionException e) {
533
534 e.printStackTrace();
535 }
536 }
537 }
538 }
539
540 /***
541 * Starting at the specific directory this method will iterate through all
542 * sub directories and add each file (as absolute name, with path as a
543 * string) to the provided ArrayList. Any file found under the settings
544 * directory with the proper suffix will be considered valid and added to
545 * the list.
546 * @param dir Starting directory
547 * @param list The list to add to
548 */
549 private void recursiveFindFiles(File dir, ArrayList<String> list){
550 File[] subs = dir.listFiles();
551 if (subs != null) {
552 for(int i=0 ; i < subs.length ; i++){
553 if(subs[i].isDirectory()){
554 recursiveFindFiles(subs[i],list);
555 } else {
556 if(subs[i].getName().endsWith(settingsFilenameSuffix)){
557
558 list.add(subs[i].getAbsolutePath());
559 }
560 }
561 }
562 }
563 }
564 }