[Schmitzm-commits] r1868 - trunk/schmitzm-core/src/main/java/de/schmitzm/io

Wed Feb 22 14:06:10 CET 2012

Author: mojays
Date: 2012-02-22 14:06:10 +0100 (Wed, 22 Feb 2012)
New Revision: 1868

Modified:
   trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java
Log:
IOUtil: new methods extractLinksFromURL(.) and listFilesFromURL(.)

Modified: trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java
===================================================================

--- trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java	2012-02-11 18:10:59 UTC (rev 1867)
+++ trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java	2012-02-22 13:06:10 UTC (rev 1868)
@@ -56,7 +56,9 @@
 import java.nio.charset.Charset;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
 import java.util.Enumeration;
+import java.util.List;
 import java.util.Vector;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -1168,7 +1170,57 @@
       return count;
 	}
 	
+	
 	/**
+	 * Extracts the values of all double-quoted {@code href="..."} attributes from
+	 * the document behind the given URL, in document order (empty values included).
+	 * @param url URL of a website
+	 * @return list of the raw attribute values (never {@code null})
+	 * @throws IOException if accessing the URL stream fails
+	 */
+	public static List<String> extractLinksFromURL(URL url) throws IOException {
+	  // load document from URL; close the stream even if reading fails
+	  java.io.InputStream in = url.openStream();
+	  String document;
+	  try {
+	    document = IOUtil.convertStreamToString(in);
+	  } finally {
+	    in.close();
+	  }
+	  // group 1 is the text between the quotation marks; reading it directly also
+	  // handles href="" (splitting the whole match at '"' fails for an empty value)
+	  Matcher m = Pattern.compile("href=\"([^\"]*)\"", Pattern.DOTALL).matcher(document);
+	  List<String> links = new ArrayList<String>();
+	  while (m.find()) {
+	    links.add(m.group(1));
+	  }
+	  return links;
+	}
+	
+    /**
+     * Lists the files of a web directory. Since a real directory listing is not
+     * (yet) available, this method merely collects every html {@code href} value
+     * found in the document behind the URL and resolves it against that URL. The
+     * URL should therefore denote a plain "directory" on the web server, i.e. one
+     * without an index.html (or similar); for an ordinary website the result
+     * might be unexpected!
+     * @param url URL of a webserver directory
+     * @return one URL per extracted link, each resolved relative to {@code url}
+     */
+    public static List<URL> listFilesFromURL(URL url) throws IOException {
+      // #### TODO: improve this workaround method! ####
+      // collect the raw link texts of the document
+      List<String> hrefs = extractLinksFromURL(url);
+
+      // resolve every link against the source URL
+      List<URL> resolved = new ArrayList<URL>();
+      for (int i = 0; i < hrefs.size(); i++) {
+        resolved.add(new URL(url, hrefs.get(i)));
+      }
+      return resolved;
+    }
+
+    /**
 	 * Copy file/folder to file/folder but doesn't throw an Exception
 	 * 
 	 * @param source
@@ -1660,5 +1712,6 @@
 		PrintWriter writer = new PrintWriter(out, true);
 		return new CharacterDevice(reader, writer);
 	}
+	
 
 }