package org.RI.P2;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.google.gson.Gson;

/**
 * Builds a Lucene index from the JSON files found under a folder.
 *
 * <p>Each file is deserialized with Gson into a {@code Paper} and stored as one Lucene document
 * with the fields {@code document_id}, {@code title} and one {@code authors} field per author.
 */
public class Indexer {
    /** Writer for the index; assigned by {@link #createIndex()}. */
    IndexWriter index;
    /** Folder that is walked for input files and also opened as the index directory. */
    String folderPath;
    /** Regular files found under {@link #folderPath} when the indexer was constructed. */
    List<File> files;
    /** Per-field analyzer handed to the {@link IndexWriterConfig}. */
    PerFieldAnalyzerWrapper customAnalyzer;

    /**
     * Collects the files to index and prepares the analyzer. No index is opened yet.
     *
     * @param folderPath folder to walk for input files
     * @throws IOException if walking the folder fails
     * @throws ParseException declared for callers; not thrown by the code in this constructor
     */
    Indexer(String folderPath) throws IOException, ParseException {
        this.folderPath = folderPath;
        files = readFiles();
        customAnalyzer = createAnalyzer();
    }

    /**
     * Creates the analyzer: {@link EnglishAnalyzer} for the {@code title} and {@code abstract}
     * fields, {@link WhitespaceAnalyzer} for every other field.
     *
     * @return the per-field analyzer wrapper
     */
    PerFieldAnalyzerWrapper createAnalyzer() {
        Map<String, Analyzer> analyzerPerField = new HashMap<>();
        analyzerPerField.put("title", new EnglishAnalyzer());
        analyzerPerField.put("abstract", new EnglishAnalyzer());
        return new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), analyzerPerField);
    }

    /**
     * Recursively lists every regular file under {@link #folderPath}.
     *
     * @return the regular files found
     * @throws IOException if the folder cannot be walked
     */
    List<File> readFiles() throws IOException {
        // Files.walk keeps directory handles open until the stream is closed.
        try (Stream<Path> paths = Files.walk(Paths.get(folderPath))) {
            return paths.filter(Files::isRegularFile)
                    .map(Path::toFile)
                    .collect(Collectors.toList());
        }
    }

    /**
     * Deserializes one JSON file into a {@code Paper} using Gson.
     *
     * @param file the JSON file to read
     * @return the object produced by {@code Gson.fromJson} for the file's content
     * @throws IOException if the file cannot be opened or closing it fails
     */
    Paper parseJSONFile(File file) throws IOException {
        // Close the reader (and the underlying stream) after parsing so that a large corpus
        // does not exhaust file descriptors; decode as UTF-8 instead of the platform default.
        try (InputStream jsonFile = new FileInputStream(file);
                Reader readerJson = new InputStreamReader(jsonFile, StandardCharsets.UTF_8)) {
            Gson gson = new Gson();
            return gson.fromJson(readerJson, Paper.class);
        }
    }

    /**
     * Opens an {@link IndexWriter} in {@link OpenMode#CREATE} mode and stores it in
     * {@link #index}.
     *
     * @throws IOException if the directory or the writer cannot be opened
     */
    void createIndex() throws IOException {
        // NOTE(review): the index is opened in the same folder that readFiles() walks, so a
        // later run would also pick up the index files as input - confirm this is intended.
        Directory dir = FSDirectory.open(Paths.get(folderPath));
        IndexWriterConfig config = new IndexWriterConfig(customAnalyzer);
        config.setOpenMode(OpenMode.CREATE);
        index = new IndexWriter(dir, config);
    }

    /**
     * Converts a paper into a Lucene document and adds it to the index.
     *
     * @param paper the parsed paper; its id, title and author list are read
     * @throws IOException if the writer fails to add the document
     */
    void addDocument(Paper paper) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES));
        doc.add(new TextField("title", paper.metadata.title, Field.Store.YES));
        for (Author author : paper.metadata.authors) {
            String authorName = author.first + " " + author.middle + " " + author.last;
            // Strip all punctuation characters from the concatenated name.
            authorName = authorName.replaceAll("\\p{P}", "");
            doc.add(new TextField("authors", authorName, Field.Store.YES));
        }
        index.addDocument(doc);
    }

    /**
     * Commits pending changes and closes the writer.
     *
     * @throws IOException if committing or closing fails
     */
    void commitChanges() throws IOException {
        index.commit();
        index.close();
    }

    /**
     * Creates the index, adds one document per collected file, then commits and closes.
     *
     * @throws IOException if reading a file or writing to the index fails
     * @throws ParseException declared for callers; not thrown by the code in this method
     */
    void populateIndex() throws IOException, ParseException {
        createIndex();
        for (File file : files) {
            Paper paper = parseJSONFile(file);
            addDocument(paper);
        }
        commitChanges();
    }

    /** Prints the usage line and terminates the JVM with exit status 1. */
    private static void usage() {
        System.out.println("Usage: Indexer ");
        System.exit(1);
    }

    /**
     * Entry point: expects exactly one argument, the data directory to index.
     *
     * @param args command-line arguments
     * @throws ParseException propagated from the indexer
     * @throws IOException if reading input or writing the index fails
     */
    public static void main(String[] args) throws ParseException, IOException {
        if (args.length != 1) {
            usage();
        }
        String dataDirectory = args[0];
        Indexer indexer = new Indexer(dataDirectory);
        indexer.populateIndex();
    }
}