package org.RI.P2; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.text.ParseException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import com.google.gson.Gson; public class Indexer { IndexWriter index; String dataFolderPath; String indexFolderPath; List files; PerFieldAnalyzerWrapper customAnalyzer; Indexer(String dataFolderPath, String indexFolderPath) throws IOException, ParseException { this.dataFolderPath = dataFolderPath; this.indexFolderPath = indexFolderPath; files = readFiles(); customAnalyzer = createAnalyzer(); } PerFieldAnalyzerWrapper createAnalyzer() { Map analyzerPerField = new HashMap<>(); analyzerPerField.put("document_id", new StandardAnalyzer()); analyzerPerField.put("title", new EnglishAnalyzer()); analyzerPerField.put("abstract", new EnglishAnalyzer()); analyzerPerField.put("authors", new StandardAnalyzer()); analyzerPerField.put("institutions", new StandardAnalyzer()); analyzerPerField.put("emails", new StandardAnalyzer()); PerFieldAnalyzerWrapper customAnalyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), analyzerPerField); return customAnalyzer; } List readFiles() throws IOException { List files = Files.walk(Paths.get(dataFolderPath)).filter(Files::isRegularFile).map(Path::toFile) .collect(Collectors.toList()); return files; } Paper parseJSONFile(File file) throws IOException { InputStream jsonFile = new FileInputStream(file); Reader readerJson = new InputStreamReader(jsonFile); Gson gson = new Gson(); Paper data = gson.fromJson(readerJson, Paper.class); return data; } void createIndex() throws IOException { Directory dir = FSDirectory.open(Paths.get(indexFolderPath)); IndexWriterConfig config = new IndexWriterConfig(customAnalyzer); config.setOpenMode(OpenMode.CREATE); index = new IndexWriter(dir, config); } void addDocument(Paper paper) throws IOException { Document doc = new Document(); doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES)); doc.add(new TextField("title", paper.metadata.title, Field.Store.YES)); StringBuilder authors = new StringBuilder(); StringBuilder institutions = new StringBuilder(); StringBuilder emails = new StringBuilder(); for (Author author : paper.metadata.authors) { String authorName = author.first + " " + author.middle + " " + author.last; authorName = authorName.replaceAll("\\p{P}", ""); authors.append(authorName); institutions.append(author.affiliation.institution); emails.append(author.email); } doc.add(new TextField("authors", authors.toString(), Field.Store.YES)); doc.add(new TextField("institution", institutions.toString(), Field.Store.NO)); doc.add(new TextField("emails", emails.toString(), Field.Store.NO)); StringBuilder fullAbstract = new StringBuilder(); for (Abstract abstr : paper.abstr) { fullAbstract.append(abstr.text); } doc.add(new TextField("abstract", fullAbstract.toString(), Field.Store.NO)); index.addDocument(doc); } void commitChanges() throws IOException { index.commit(); index.close(); } void populateIndex() throws IOException, ParseException { createIndex(); for (File file : files) { Paper paper = parseJSONFile(file); if (paper != null) { addDocument(paper); } } commitChanges(); } private static void usage() { System.out.println("Usage: Indexer "); System.exit(1); } public static void main(String[] args) throws ParseException, IOException { if (args.length != 1) { usage(); } String dataDirectory = args[0]; Indexer indexer = new Indexer(dataDirectory, ".index"); indexer.populateIndex(); } }