[Schmitzm-commits] r1868 - trunk/schmitzm-core/src/main/java/de/schmitzm/io
scm-commit at wald.intevation.org
scm-commit at wald.intevation.org
Wed Feb 22 14:06:10 CET 2012
Author: mojays
Date: 2012-02-22 14:06:10 +0100 (Wed, 22 Feb 2012)
New Revision: 1868
Modified:
trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java
Log:
IOUtil: new methods extractLinksFromURL(.) and listFilesFromURL(.)
Modified: trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java
===================================================================
--- trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java 2012-02-11 18:10:59 UTC (rev 1867)
+++ trunk/schmitzm-core/src/main/java/de/schmitzm/io/IOUtil.java 2012-02-22 13:06:10 UTC (rev 1868)
@@ -56,7 +56,9 @@
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
import java.util.Enumeration;
+import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -1168,7 +1170,57 @@
return count;
}
+
/**
+ * Extracts the values of all {@code href="..."} attributes from the document at a URL.
+ * @param url URL of a website
+ * @return attribute values in document order; empty values ({@code href=""}) are included
+ * @throws IOException if the document can not be read
+ */
+ public static List<String> extractLinksFromURL(URL url) throws IOException {
+   // convertStreamToString is not visible here, so close the stream defensively
+   final java.io.InputStream in = url.openStream();
+   final String document;
+   try {
+     document = IOUtil.convertStreamToString(in);
+   } finally {
+     in.close();
+   }
+
+   // capturing group 1 holds the text between the quotation marks
+   final Matcher m = Pattern.compile("href=\"([^\"]*)\"", Pattern.DOTALL).matcher(document);
+   final List<String> links = new ArrayList<String>();
+   while (m.find()) {
+     // group(1) instead of group().split("\"")[1]: the split variant fails on href=""
+     links.add(m.group(1));
+   }
+   return links;
+ }
+
+ /**
+ * Lists the entries of a web directory as URLs. This is a workaround: every
+ * {@code href} value found in the document served at the given URL is taken as a
+ * file name and resolved against that URL. The result is only meaningful if the
+ * URL denotes a server-generated directory listing; for an ordinary web page
+ * (e.g. a directory that serves an index.html) all links of the page are returned.
+ * @param url URL of a webserver directory
+ * @return one URL per extracted link, in the order the links were extracted
+ * @throws IOException if the links can not be extracted or a link can not be resolved
+ */
+ public static List<URL> listFilesFromURL(URL url) throws IOException {
+   // #### TODO: improve this workaround method! ####
+
+   // fetch the raw link texts first
+   final List<String> links = extractLinksFromURL(url);
+   // resolve each link relative to the source URL
+   final List<URL> resolved = new ArrayList<URL>();
+   for (String link : links) {
+     resolved.add(new URL(url, link));
+   }
+   return resolved;
+ }
+
+ /**
* Copy file/folder to file/folder but doesn't throw an Exception
*
* @param source
@@ -1660,5 +1712,6 @@
PrintWriter writer = new PrintWriter(out, true);
return new CharacterDevice(reader, writer);
}
+
}
More information about the Schmitzm-commits
mailing list