RI-P2/src/main/java/org/RI/P2/Indexer.java
2021-01-11 20:09:47 +01:00

137 lines
5.1 KiB
Java

package org.RI.P2;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.google.gson.Gson;
/**
 * Builds a Lucene index from a directory tree of JSON paper files.
 *
 * <p>Typical use: construct with the data folder and the index folder, then call
 * {@link #populateIndex()}, which opens the index, adds one document per parsed paper
 * and commits.
 */
public class Indexer {
    /** Separator placed between authors in the stored {@code authors} field. */
    private static final String AUTHOR_SEPARATOR = ", ";

    /** Separator placed between values concatenated into an index-only field. */
    private static final char VALUE_SEPARATOR = ' ';

    /** Index writer; created by {@link #createIndex()} and closed by {@link #commitChanges()}. */
    IndexWriter index;
    /** Root folder that is walked for input files. */
    String dataFolderPath;
    /** Folder in which the Lucene index is written. */
    String indexFolderPath;
    /** Regular files found under {@link #dataFolderPath} at construction time. */
    List<File> files;
    /** Per-field analyzer used when writing the index. */
    PerFieldAnalyzerWrapper customAnalyzer;

    /**
     * Lists the input files and prepares the analyzer; the index itself is not opened yet.
     *
     * @param dataFolderPath  folder walked recursively for input files
     * @param indexFolderPath folder where the index will be created
     * @throws IOException    if the data folder cannot be walked
     * @throws ParseException kept in the signature so existing callers that catch it still compile
     */
    Indexer(String dataFolderPath, String indexFolderPath) throws IOException, ParseException {
        this.dataFolderPath = dataFolderPath;
        this.indexFolderPath = indexFolderPath;
        files = readFiles();
        customAnalyzer = createAnalyzer();
    }

    /**
     * Creates the per-field analyzer: English analysis for {@code title} and {@code abstract},
     * standard analysis for the other registered fields, whitespace analysis as the fallback.
     *
     * @return the analyzer wrapper
     */
    PerFieldAnalyzerWrapper createAnalyzer() {
        Map<String, Analyzer> analyzerPerField = new HashMap<>();
        analyzerPerField.put("document_id", new StandardAnalyzer());
        analyzerPerField.put("title", new EnglishAnalyzer());
        analyzerPerField.put("abstract", new EnglishAnalyzer());
        analyzerPerField.put("authors", new StandardAnalyzer());
        // NOTE(review): addDocument writes this field as "institution" (singular), so this
        // entry does not match it and that field falls back to the WhitespaceAnalyzer
        // default. Left unchanged because renaming the field or the key would alter the
        // index that search code outside this file may rely on; confirm and align.
        analyzerPerField.put("institutions", new StandardAnalyzer());
        analyzerPerField.put("emails", new StandardAnalyzer());
        return new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), analyzerPerField);
    }

    /**
     * Recursively collects every regular file under {@link #dataFolderPath}.
     *
     * @return the files found
     * @throws IOException if the folder cannot be walked
     */
    List<File> readFiles() throws IOException {
        // Files.walk keeps directory handles open until the stream is closed.
        try (Stream<Path> paths = Files.walk(Paths.get(dataFolderPath))) {
            return paths.filter(Files::isRegularFile).map(Path::toFile).collect(Collectors.toList());
        }
    }

    /**
     * Parses one JSON file into a {@code Paper}.
     *
     * @param file the JSON file to read
     * @return the parsed paper, or {@code null} if Gson produces none (e.g. empty input)
     * @throws IOException if the file cannot be opened or closed
     */
    Paper parseJSONFile(File file) throws IOException {
        // Decode as UTF-8 explicitly instead of relying on the platform default charset,
        // and close the stream once Gson is done with it.
        try (InputStream jsonFile = new FileInputStream(file);
                Reader readerJson = new InputStreamReader(jsonFile, StandardCharsets.UTF_8)) {
            return new Gson().fromJson(readerJson, Paper.class);
        }
    }

    /**
     * Opens a writer on {@link #indexFolderPath} in {@code CREATE} mode and stores it in
     * {@link #index}.
     *
     * @throws IOException if the directory or the writer cannot be opened
     */
    void createIndex() throws IOException {
        Directory dir = FSDirectory.open(Paths.get(indexFolderPath));
        IndexWriterConfig config = new IndexWriterConfig(customAnalyzer);
        config.setOpenMode(OpenMode.CREATE);
        index = new IndexWriter(dir, config);
    }

    /**
     * Converts a paper into a Lucene document and adds it to the index.
     *
     * <p>Fields written: {@code document_id} and {@code title} (stored), {@code authors}
     * (stored, authors separated by {@code ", "}), and the index-only fields
     * {@code institution}, {@code emails} and {@code abstract}. Missing (null) metadata,
     * authors, name parts, affiliations, emails and abstract paragraphs are skipped rather
     * than indexed as the text {@code "null"}.
     *
     * @param paper the parsed paper; must not be {@code null}
     * @throws IOException if the writer fails to add the document
     */
    void addDocument(Paper paper) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES));

        String title = "";
        if (paper.metadata != null && paper.metadata.title != null) {
            title = paper.metadata.title;
        }
        doc.add(new TextField("title", title, Field.Store.YES));

        StringBuilder authors = new StringBuilder();
        StringBuilder institutions = new StringBuilder();
        StringBuilder emails = new StringBuilder();
        if (paper.metadata != null && paper.metadata.authors != null) {
            for (Author author : paper.metadata.authors) {
                if (author == null) {
                    continue;
                }
                // Strip punctuation as before, then collapse the gaps left by absent parts.
                String authorName = (text(author.first) + " " + text(author.middle) + " " + text(author.last))
                        .replaceAll("\\p{P}", "")
                        .replaceAll("\\s+", " ")
                        .trim();
                if (!authorName.isEmpty()) {
                    // Without a separator adjacent names fuse into a single token.
                    if (authors.length() > 0) {
                        authors.append(AUTHOR_SEPARATOR);
                    }
                    authors.append(authorName);
                }
                if (author.affiliation != null && author.affiliation.institution != null) {
                    institutions.append(author.affiliation.institution).append(VALUE_SEPARATOR);
                }
                if (author.email != null) {
                    emails.append(author.email).append(VALUE_SEPARATOR);
                }
            }
        }
        doc.add(new TextField("authors", authors.toString(), Field.Store.YES));
        // NOTE(review): field name is singular while createAnalyzer registers "institutions".
        doc.add(new TextField("institution", institutions.toString().trim(), Field.Store.NO));
        doc.add(new TextField("emails", emails.toString().trim(), Field.Store.NO));

        StringBuilder fullAbstract = new StringBuilder();
        if (paper.abstr != null) {
            for (Abstract abstr : paper.abstr) {
                if (abstr != null && abstr.text != null) {
                    // Keep paragraphs apart so the last word of one does not merge with the next.
                    fullAbstract.append(abstr.text).append(VALUE_SEPARATOR);
                }
            }
        }
        doc.add(new TextField("abstract", fullAbstract.toString().trim(), Field.Store.NO));

        index.addDocument(doc);
    }

    /**
     * Commits pending changes and closes the writer.
     *
     * @throws IOException if committing or closing fails
     */
    void commitChanges() throws IOException {
        // try-with-resources closes the writer even when commit() throws and keeps the
        // commit failure as the primary exception.
        try (IndexWriter writer = index) {
            writer.commit();
        }
    }

    /**
     * Creates the index, adds every parseable file as a document and commits.
     *
     * <p>If parsing or indexing fails, the writer is rolled back (uncommitted changes are
     * discarded and the writer is closed) before the failure is rethrown.
     *
     * @throws IOException    if reading a file or writing the index fails
     * @throws ParseException kept in the signature for existing callers
     */
    void populateIndex() throws IOException, ParseException {
        createIndex();
        try {
            for (File file : files) {
                Paper paper = parseJSONFile(file);
                if (paper != null) {
                    addDocument(paper);
                }
            }
            commitChanges();
        } catch (IOException | RuntimeException failure) {
            try {
                // No-op if commitChanges already closed the writer.
                index.rollback();
            } catch (IOException | RuntimeException rollbackFailure) {
                failure.addSuppressed(rollbackFailure);
            }
            throw failure;
        }
    }

    /** Returns the string form of {@code value}, or an empty string when it is {@code null}. */
    private static String text(Object value) {
        return value == null ? "" : String.valueOf(value);
    }

    /** Prints the command-line usage and terminates the JVM with exit status 1. */
    private static void usage() {
        System.out.println("Usage: Indexer <directory>");
        System.exit(1);
    }

    /**
     * Command-line entry point: indexes the given directory into the {@code .index} folder.
     *
     * @param args exactly one element, the data directory
     * @throws ParseException declared by the constructor and {@link #populateIndex()}
     * @throws IOException    if reading the data or writing the index fails
     */
    public static void main(String[] args) throws ParseException, IOException {
        if (args.length != 1) {
            usage();
        }
        String dataDirectory = args[0];
        Indexer indexer = new Indexer(dataDirectory, ".index");
        indexer.populateIndex();
    }
}