Read all files from a directory
This commit is contained in:
32
src/main/java/org/RI/P1/AnalyzeDirectory.java
Normal file
32
src/main/java/org/RI/P1/AnalyzeDirectory.java
Normal file
@@ -0,0 +1,32 @@
|
||||
package org.RI.P1;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
public class AnalyzeDirectory {
|
||||
private static List<File> files;
|
||||
|
||||
private static void readFiles(String directory) throws IOException {
|
||||
files = Files.walk(Paths.get(directory)).filter(Files::isRegularFile).map(Path::toFile)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException, TikaException, SAXException {
|
||||
String directory = args[0];
|
||||
readFiles(directory);
|
||||
for (File file : files) {
|
||||
FileData data = new FileData(file);
|
||||
System.out.println(data);
|
||||
System.out.println("--------------------------------------------------------------------------");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -30,13 +30,14 @@ public class FileData {
|
||||
FileData() {
|
||||
}
|
||||
|
||||
FileData(File file) throws FileNotFoundException {
|
||||
FileData(File file) throws IOException, TikaException, SAXException {
|
||||
inputStream = new FileInputStream(file);
|
||||
metadata = new Metadata();
|
||||
parser = new AutoDetectParser();
|
||||
contentHandler = new BodyContentHandler(-1);
|
||||
parseContext = new ParseContext();
|
||||
langIdentifier = new OptimaizeLangDetector().loadModels();
|
||||
setMetadata();
|
||||
}
|
||||
|
||||
private void setMetadata() throws IOException, TikaException, SAXException {
|
||||
@@ -53,14 +54,4 @@ public class FileData {
|
||||
+ language.getLanguage() + "\n";
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException, TikaException, SAXException {
|
||||
try {
|
||||
File file = new File(args[0]);
|
||||
FileData data = new FileData(file);
|
||||
data.setMetadata();
|
||||
System.out.println(data);
|
||||
} catch (FileNotFoundException exp) {
|
||||
System.out.println("The file " + args[0] + " could not be found");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
* P1
|
||||
** TODO Extract all URLs
|
||||
** TODO Write to a file all word occurrences and frequencies
|
||||
Sorted in decreasing order of frequency
|
||||
** TODO Plot word frequencies
|
||||
With gnuplot, with documents of at least 3 different languages.
|
||||
We'll fit this to the Booth and Federowicz equation
|
||||
** DONE Create a table with information about all documents
|
||||
CLOSED: [2020-10-25 Sun 19:58]
|
||||
| filename | type | encoding | language |
|
||||
Reference in New Issue
Block a user