Read all files from a directory

This commit is contained in:
2020-10-25 20:24:59 +01:00
parent 0620f42fa4
commit 68efc348bd
4 changed files with 36 additions and 13 deletions

View File

@@ -0,0 +1,32 @@
package org.RI.P1;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tika.exception.TikaException;
import org.xml.sax.SAXException;
/**
 * Command-line entry point that prints a {@link FileData} summary for every regular file found
 * under a directory tree.
 *
 * <p>Usage: {@code AnalyzeDirectory <directory>}
 */
public class AnalyzeDirectory {

    /** Line printed after each file's summary to separate it from the next one. */
    private static final String SEPARATOR =
            "--------------------------------------------------------------------------";

    /**
     * Collects every regular file in the tree rooted at {@code directory}.
     *
     * @param directory path of the directory to walk
     * @return the regular files found, as {@link File} objects
     * @throws IOException if the starting directory cannot be accessed
     */
    private static List<File> readFiles(String directory) throws IOException {
        // Files.walk keeps directory handles open until the stream is closed, so it must be
        // wrapped in try-with-resources rather than consumed as a bare expression.
        try (Stream<Path> paths = Files.walk(Paths.get(directory))) {
            return paths.filter(Files::isRegularFile)
                    .map(Path::toFile)
                    .collect(Collectors.toList());
        }
    }

    /**
     * Prints the {@link FileData} description of each file under the directory given as the first
     * argument, followed by a separator line.
     *
     * @param args command-line arguments; {@code args[0]} is the directory to analyze
     * @throws IOException if the directory or one of its files cannot be read
     * @throws TikaException propagated from the {@link FileData} constructor
     * @throws SAXException propagated from the {@link FileData} constructor
     */
    public static void main(String[] args) throws IOException, TikaException, SAXException {
        if (args.length == 0) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: AnalyzeDirectory <directory>");
            System.exit(1);
            return;
        }

        for (File file : readFiles(args[0])) {
            FileData data = new FileData(file);
            System.out.println(data);
            System.out.println(SEPARATOR);
        }
    }
}

View File

@@ -30,13 +30,14 @@ public class FileData {
/** No-argument constructor; its body is intentionally empty. */
FileData() {}
FileData(File file) throws FileNotFoundException {
/**
 * Opens {@code file}, creates the Tika helper objects used by this instance and then runs
 * {@code setMetadata()}.
 *
 * @param file the file to open for reading
 * @throws IOException declared because opening the file, loading the language models or
 *     {@code setMetadata()} may fail with it
 * @throws TikaException declared by {@code setMetadata()}
 * @throws SAXException declared by {@code setMetadata()}
 */
FileData(File file) throws IOException, TikaException, SAXException {
    // NOTE(review): the stream opened here is not closed inside this constructor;
    // confirm that it is closed elsewhere in the class.
    this.inputStream = new FileInputStream(file);

    this.metadata = new Metadata();
    this.parser = new AutoDetectParser();
    // -1 is passed as the handler's write limit; per the Tika docs this disables the limit.
    this.contentHandler = new BodyContentHandler(-1);
    this.parseContext = new ParseContext();
    this.langIdentifier = new OptimaizeLangDetector().loadModels();

    this.setMetadata();
}
private void setMetadata() throws IOException, TikaException, SAXException {
@@ -53,14 +54,4 @@ public class FileData {
+ language.getLanguage() + "\n";
}
/**
 * Command-line entry point: builds a {@code FileData} for the file named by the first argument,
 * runs {@code setMetadata()} on it and prints it to standard output.
 *
 * <p>If the file cannot be found, a message naming it is printed instead.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to describe
 * @throws IOException on I/O failures other than the file not being found
 * @throws TikaException declared by the {@code FileData} constructor and {@code setMetadata()}
 * @throws SAXException declared by the {@code FileData} constructor and {@code setMetadata()}
 */
public static void main(String[] args) throws IOException, TikaException, SAXException {
    final String path = args[0];
    try {
        FileData data = new FileData(new File(path));
        data.setMetadata();
        System.out.println(data);
    } catch (FileNotFoundException notFound) {
        System.out.println("The file " + path + " could not be found");
    }
}
}

View File

@@ -1,10 +0,0 @@
* P1
** TODO Extract all URLs
** TODO Write all word occurrences and their frequencies to a file
Sorted in decreasing order of frequency
** TODO Plot word frequencies
Use gnuplot, with documents in at least 3 different languages.
We'll fit this to the Booth and Federowicz equation
** DONE Create a table with information about all documents
CLOSED: [2020-10-25 Sun 19:58]
| filename | type | encoding | language |