137 lines
5.1 KiB
Java
137 lines
5.1 KiB
Java
package org.RI.P2;
|
|
|
|
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.google.gson.Gson;
|
|
|
|
public class Indexer {
|
|
IndexWriter index;
|
|
String dataFolderPath;
|
|
String indexFolderPath;
|
|
List<File> files;
|
|
PerFieldAnalyzerWrapper customAnalyzer;
|
|
|
|
Indexer(String dataFolderPath, String indexFolderPath) throws IOException, ParseException {
|
|
this.dataFolderPath = dataFolderPath;
|
|
this.indexFolderPath = indexFolderPath;
|
|
files = readFiles();
|
|
customAnalyzer = createAnalyzer();
|
|
}
|
|
|
|
PerFieldAnalyzerWrapper createAnalyzer() {
|
|
Map<String, Analyzer> analyzerPerField = new HashMap<>();
|
|
analyzerPerField.put("document_id", new StandardAnalyzer());
|
|
analyzerPerField.put("title", new EnglishAnalyzer());
|
|
analyzerPerField.put("abstract", new EnglishAnalyzer());
|
|
analyzerPerField.put("authors", new StandardAnalyzer());
|
|
analyzerPerField.put("institutions", new StandardAnalyzer());
|
|
analyzerPerField.put("emails", new StandardAnalyzer());
|
|
PerFieldAnalyzerWrapper customAnalyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(),
|
|
analyzerPerField);
|
|
return customAnalyzer;
|
|
}
|
|
|
|
List<File> readFiles() throws IOException {
|
|
List<File> files = Files.walk(Paths.get(dataFolderPath)).filter(Files::isRegularFile).map(Path::toFile)
|
|
.collect(Collectors.toList());
|
|
return files;
|
|
}
|
|
|
|
Paper parseJSONFile(File file) throws IOException {
|
|
InputStream jsonFile = new FileInputStream(file);
|
|
Reader readerJson = new InputStreamReader(jsonFile);
|
|
Gson gson = new Gson();
|
|
Paper data = gson.fromJson(readerJson, Paper.class);
|
|
return data;
|
|
}
|
|
|
|
void createIndex() throws IOException {
|
|
Directory dir = FSDirectory.open(Paths.get(indexFolderPath));
|
|
IndexWriterConfig config = new IndexWriterConfig(customAnalyzer);
|
|
config.setOpenMode(OpenMode.CREATE);
|
|
index = new IndexWriter(dir, config);
|
|
}
|
|
|
|
void addDocument(Paper paper) throws IOException {
|
|
Document doc = new Document();
|
|
doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES));
|
|
doc.add(new TextField("title", paper.metadata.title, Field.Store.YES));
|
|
StringBuilder authors = new StringBuilder();
|
|
StringBuilder institutions = new StringBuilder();
|
|
StringBuilder emails = new StringBuilder();
|
|
for (Author author : paper.metadata.authors) {
|
|
String authorName = author.first + " " + author.middle + " " + author.last;
|
|
authorName = authorName.replaceAll("\\p{P}", "");
|
|
authors.append(authorName);
|
|
institutions.append(author.affiliation.institution);
|
|
emails.append(author.email);
|
|
}
|
|
doc.add(new TextField("authors", authors.toString(), Field.Store.YES));
|
|
doc.add(new TextField("institution", institutions.toString(), Field.Store.NO));
|
|
doc.add(new TextField("emails", emails.toString(), Field.Store.NO));
|
|
StringBuilder fullAbstract = new StringBuilder();
|
|
for (Abstract abstr : paper.abstr) {
|
|
fullAbstract.append(abstr.text);
|
|
}
|
|
doc.add(new TextField("abstract", fullAbstract.toString(), Field.Store.NO));
|
|
index.addDocument(doc);
|
|
}
|
|
|
|
void commitChanges() throws IOException {
|
|
index.commit();
|
|
index.close();
|
|
}
|
|
|
|
void populateIndex() throws IOException, ParseException {
|
|
createIndex();
|
|
for (File file : files) {
|
|
Paper paper = parseJSONFile(file);
|
|
if (paper != null) {
|
|
addDocument(paper);
|
|
}
|
|
}
|
|
commitChanges();
|
|
}
|
|
|
|
private static void usage() {
|
|
System.out.println("Usage: Indexer <directory>");
|
|
System.exit(1);
|
|
}
|
|
|
|
public static void main(String[] args) throws ParseException, IOException {
|
|
if (args.length != 1) {
|
|
usage();
|
|
}
|
|
String dataDirectory = args[0];
|
|
Indexer indexer = new Indexer(dataDirectory, ".index");
|
|
indexer.populateIndex();
|
|
}
|
|
}
|