Compare commits

..

18 Commits

8 changed files with 539 additions and 26 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
data data
data-test
target target
output output
.index

112
docs/Summary.org Normal file
View File

@@ -0,0 +1,112 @@
#+TITLE: Práctica final
#+SUBTITLE: Recuperación de Información
#+AUTHOR: Amin Kasrou Aouam
#+DATE: 2021-01-11
#+PANDOC_OPTIONS: template:~/.pandoc/templates/eisvogel.latex
#+PANDOC_OPTIONS: listings:t
#+PANDOC_OPTIONS: toc:t
#+PANDOC_METADATA: lang=es
#+PANDOC_METADATA: titlepage:t
#+PANDOC_METADATA: listings-no-page-break:t
#+PANDOC_METADATA: toc-own-page:t
#+PANDOC_METADATA: table-use-row-colors:t
#+PANDOC_METADATA: logo:/home/coolneng/Photos/Logos/UGR.png
* Práctica final
En esta práctica, vamos a implementar un buscador de información sobre una serie de documentos /JSON/ usando /Apache Lucene/.
** /Parsing/
Parseamos los documentos /JSON/ mediante la librería /GSON/, dado que ésta nos ofrece la posibilidad de serializar el documento en una clase Java.
Esta funcionalidad nos facilita la extracción de información en /JSON/ complejos. Para ello, creamos la clase *Paper*:
#+begin_src java
package org.RI.P2;
import java.util.List;
import com.google.gson.annotations.SerializedName;
class Affiliation {
String laboratory;
String institution;
}
class Author {
String first;
List<String> middle;
String last;
String suffix;
Affiliation affiliation;
String email;
}
class Metadata {
String title;
List<Author> authors;
}
class Abstract {
String text;
}
public class Paper {
String paper_id;
Metadata metadata;
@SerializedName("abstract") List<Abstract> abstr;
}
#+end_src
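Es esencial utilizar el mismo nombre de atributo en nuestra clase, dado que esto permitirá un /mapping/ correcto. En caso de que no fuera posible (por ejemplo, =abstract= es una palabra reservada en Java, por lo que usamos el atributo =abstr=), podemos hacer uso de la anotación *@SerializedName*.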
El único paso que nos falta es indicarle a la librería la entrada (/JSON/) y la clase.
#+begin_src java
Gson gson = new Gson();
Paper data = gson.fromJson(readerJson, Paper.class);
#+end_src
** Indexación
Para la indexación, hemos elegido los siguientes atributos:
- =paper_id=
- title
- authors
- institution
- emails
- abstract
Optamos por crear un índice en cada ejecución, para evitar obtener valores repetidos.
** Buscador
Implementamos un buscador con interfaz gráfica (GUI), basada en el proyecto de ejemplo disponible en la plataforma Prado.
Podemos optar por buscar en diferentes campos, mediante un argumento pasado por línea de comandos. Especificamos las distintas opciones en la sección siguiente.
** Ejecución
En el caso de que deseemos utilizar /Maven/, debemos ejecutar los siguientes comandos:
1. Compilar el proyecto
#+BEGIN_SRC shell
mvn compile
#+END_SRC
2. Ejecutar el proyecto
#+BEGIN_SRC shell
mvn exec:java -Dexec.mainClass="org.RI.P2.Searcher" -Dexec.args="data title"
#+END_SRC
Debemos modificar el argumento *title* según el campo sobre el que deseemos buscar:
- *title*
- *authors*
- *abstract*
- *institutions*
- *emails*

BIN
docs/Summary.pdf Normal file

Binary file not shown.

11
pom.xml
View File

@@ -30,9 +30,14 @@
<version>8.6.3</version> <version>8.6.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.googlecode.json-simple</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>json-simple</artifactId> <artifactId>lucene-queryparser</artifactId>
<version>1.1.1</version> <version>8.6.3</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.6</version>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@@ -1,79 +1,135 @@
package org.RI.P2; package org.RI.P2;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.text.ParseException; import java.text.ParseException;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.json.simple.JSONArray; import com.google.gson.Gson;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
public class Indexer { public class Indexer {
IndexWriter index; IndexWriter index;
String folderPath; String dataPath;
String indexPath;
List<File> files;
PerFieldAnalyzerWrapper customAnalyzer; PerFieldAnalyzerWrapper customAnalyzer;
Indexer(String folderPath) throws IOException, ParseException { Indexer(String dataPath, String indexPath) throws IOException, ParseException {
this.folderPath = folderPath; this.dataPath = dataPath;
this.indexPath = indexPath;
files = readFiles();
customAnalyzer = createAnalyzer(); customAnalyzer = createAnalyzer();
} }
PerFieldAnalyzerWrapper createAnalyzer() { PerFieldAnalyzerWrapper createAnalyzer() {
Map<String, Analyzer> analyzerPerField = new HashMap<>(); Map<String, Analyzer> analyzerPerField = new HashMap<>();
analyzerPerField.put("document_id", new StandardAnalyzer());
analyzerPerField.put("title", new EnglishAnalyzer()); analyzerPerField.put("title", new EnglishAnalyzer());
analyzerPerField.put("abstract", new EnglishAnalyzer()); analyzerPerField.put("abstract", new EnglishAnalyzer());
analyzerPerField.put("authors", new StandardAnalyzer());
analyzerPerField.put("institutions", new StandardAnalyzer());
analyzerPerField.put("emails", new StandardAnalyzer());
PerFieldAnalyzerWrapper customAnalyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), PerFieldAnalyzerWrapper customAnalyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(),
analyzerPerField); analyzerPerField);
return customAnalyzer; return customAnalyzer;
} }
JSONArray parseJSONFile(String filePath) throws IOException, ParseException { List<File> readFiles() throws IOException {
InputStream jsonFile = getClass().getResourceAsStream(filePath); List<File> files = Files.walk(Paths.get(dataPath)).filter(Files::isRegularFile).map(Path::toFile)
Reader readerJson = new InputStreamReader(jsonFile); .collect(Collectors.toList());
Object fileObjects = JSONValue.parse(readerJson); return files;
JSONArray arrayObjects = (JSONArray) fileObjects;
return arrayObjects;
} }
void openIndex() throws IOException { Paper parseJSONFile(File file) throws IOException {
Directory dir = FSDirectory.open(Paths.get(folderPath)); InputStream jsonFile = new FileInputStream(file);
Reader readerJson = new InputStreamReader(jsonFile);
Gson gson = new Gson();
Paper data = gson.fromJson(readerJson, Paper.class);
return data;
}
void createIndex() throws IOException {
Directory dir = FSDirectory.open(Paths.get(indexPath));
IndexWriterConfig config = new IndexWriterConfig(customAnalyzer); IndexWriterConfig config = new IndexWriterConfig(customAnalyzer);
config.setOpenMode(OpenMode.CREATE_OR_APPEND); config.setOpenMode(OpenMode.CREATE);
index = new IndexWriter(dir, config); index = new IndexWriter(dir, config);
} }
void addDocuments(JSONArray jsonObjects) throws IOException { void populatePaperMetadata(Paper paper, StringBuilder authors, StringBuilder institutions, StringBuilder emails) {
for (JSONObject object : (List<JSONObject>) jsonObjects) { for (Author author : paper.metadata.authors) {
Document doc = new Document(); String authorName = author.first + " " + author.middle + " " + author.last + " ";
index.addDocument(doc); authorName = authorName.replaceAll("\\p{P}", "");
authors.append(authorName);
institutions.append(author.affiliation.institution);
emails.append(author.email);
} }
} }
/**
 * Appends every paragraph of the paper's abstract to {@code fullAbstract}.
 *
 * Paragraphs are separated by a single space so that the last word of one
 * paragraph and the first word of the next are not fused into a single token
 * when the text is analyzed. A paper without an abstract list, or an entry
 * without text, contributes nothing instead of raising a
 * NullPointerException or appending the literal string "null".
 *
 * @param paper        parsed paper whose abstract paragraphs are read
 * @param fullAbstract accumulator that receives the concatenated text
 */
void populateFullAbstract(Paper paper, StringBuilder fullAbstract) {
    if (paper.abstr == null) {
        // Gson leaves the field null when the JSON has no "abstract" key.
        return;
    }
    for (Abstract abstr : paper.abstr) {
        if (abstr == null || abstr.text == null) {
            continue;
        }
        if (fullAbstract.length() > 0) {
            fullAbstract.append(' ');
        }
        fullAbstract.append(abstr.text);
    }
}
/**
 * Fills {@code doc} with the Lucene fields extracted from {@code paper}.
 *
 * Fields written: "document_id" (exact-match StringField, stored), "title"
 * and "authors" (analyzed, stored so they can be shown in the results table),
 * and "institutions", "emails" and "abstract" (analyzed, not stored).
 *
 * @param paper parsed paper providing the values
 * @param doc   Lucene document that receives the fields
 */
void populateDocumentFields(Paper paper, Document doc) {
    doc.add(new StringField("document_id", paper.paper_id, Field.Store.YES));
    doc.add(new TextField("title", paper.metadata.title, Field.Store.YES));
    StringBuilder authors = new StringBuilder();
    StringBuilder institutions = new StringBuilder();
    StringBuilder emails = new StringBuilder();
    populatePaperMetadata(paper, authors, institutions, emails);
    doc.add(new TextField("authors", authors.toString(), Field.Store.YES));
    // The field must be named "institutions": that is the key registered in the
    // per-field analyzer map and the only spelling the Searcher accepts as a
    // search field. Indexing it as "institution" made those searches return nothing.
    doc.add(new TextField("institutions", institutions.toString(), Field.Store.NO));
    doc.add(new TextField("emails", emails.toString(), Field.Store.NO));
    StringBuilder fullAbstract = new StringBuilder();
    populateFullAbstract(paper, fullAbstract);
    doc.add(new TextField("abstract", fullAbstract.toString(), Field.Store.NO));
}
/**
 * Converts one parsed paper into a Lucene document and hands it to the index writer.
 *
 * @param paper parsed paper to index
 * @throws IOException if the index writer fails to add the document
 */
void addDocument(Paper paper) throws IOException {
    final Document luceneDocument = new Document();
    populateDocumentFields(paper, luceneDocument);
    index.addDocument(luceneDocument);
}
void commitChanges() throws IOException { void commitChanges() throws IOException {
index.commit(); index.commit();
index.close(); index.close();
} }
void createIndex() throws IOException, ParseException { void populateIndex() throws IOException, ParseException {
JSONArray jsonObjects = parseJSONFile(folderPath); createIndex();
openIndex(); for (File file : files) {
addDocuments(jsonObjects); Paper paper = parseJSONFile(file);
if (paper != null) {
addDocument(paper);
}
}
commitChanges(); commitChanges();
} }
} }

View File

@@ -0,0 +1,35 @@
package org.RI.P2;
import java.util.List;
import com.google.gson.annotations.SerializedName;
/**
 * Affiliation data of an {@link Author}. Gson fills the fields from the JSON
 * keys that carry the same names, so the field names must not be changed.
 */
class Affiliation {
// Laboratory name; not read by the Indexer.
String laboratory;
// Institution name; the Indexer appends it to the institutions text of the paper.
String institution;
}
/**
 * One entry of the author list in a paper's metadata. Gson fills the fields
 * from the JSON keys that carry the same names.
 */
class Author {
// First-name part; the Indexer joins first, middle and last into the author name.
String first;
// Middle-name parts, kept as a list.
List<String> middle;
// Last-name part.
String last;
// Name suffix; not read by the Indexer.
String suffix;
// Laboratory/institution the author belongs to.
Affiliation affiliation;
// Contact e-mail; the Indexer appends it to the emails text of the paper.
String email;
}
/**
 * Metadata section of a paper: its title and its authors.
 */
class Metadata {
// Title of the paper; indexed and stored as the "title" field.
String title;
// Authors of the paper, iterated by the Indexer to build the author-related fields.
List<Author> authors;
}
/**
 * One element of a paper's abstract list; the Indexer concatenates the text
 * of all elements into the "abstract" field.
 */
class Abstract {
// Text of this abstract element.
String text;
}
/**
 * Root object that Gson deserializes each JSON file into. Field names mirror
 * the JSON keys, which is what lets Gson map them without extra configuration.
 */
public class Paper {
// Identifier of the paper; indexed as the "document_id" field.
String paper_id;
// Title and author list of the paper.
Metadata metadata;
// "abstract" is a reserved word in Java, so the JSON key is mapped onto abstr explicitly.
@SerializedName("abstract")
List<Abstract> abstr;
}

View File

@@ -0,0 +1,196 @@
package org.RI.P2;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import javax.swing.table.DefaultTableModel;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.queryparser.classic.QueryParser;
public class Searcher extends javax.swing.JFrame {
IndexSearcher searcher;
String dataPath;
String indexPath;
String field;
/**
 * Creates the search window over an already populated index.
 *
 * @param dataPath  directory holding the JSON documents
 * @param indexPath directory holding the Lucene index to open
 * @param field     name of the index field that queries are run against
 * @throws IOException if the index cannot be opened
 */
Searcher(String dataPath, String indexPath, String field) throws IOException {
    this.field = field;
    this.indexPath = indexPath;
    this.dataPath = dataPath;
    // The index must be open before the GUI can trigger a search.
    this.searcher = createIndexSearcher();
    initComponents();
}
/**
 * Builds the window contents: a query text field, a "Buscar" button and a
 * two-column results table inside a scroll pane, arranged with GroupLayout.
 * The layout code matches what the NetBeans form editor generates, so its
 * statement order is left untouched.
 */
private void initComponents() {
jTextField1 = new javax.swing.JTextField();
jButton1 = new javax.swing.JButton();
jScrollPane1 = new javax.swing.JScrollPane();
jTable1 = new javax.swing.JTable();
// Closing the window terminates the application.
setDefaultCloseOperation(javax.swing.WindowConstants.EXIT_ON_CLOSE);
jTextField1.setToolTipText("");
jButton1.setText("Buscar");
jButton1.setToolTipText("");
// Clicking the button runs the search; failures are only reported on stderr
// so that an invalid query does not propagate out of the listener.
jButton1.addActionListener(new java.awt.event.ActionListener() {
public void actionPerformed(java.awt.event.ActionEvent evt) {
try {
jButton1ActionPerformed(evt);
} catch (IOException exp) {
System.err.println(exp);
} catch (org.apache.lucene.queryparser.classic.ParseException exp) {
System.err.println(exp);
}
}
});
// Empty table model with the two result columns: title and authors.
jTable1.setModel(new javax.swing.table.DefaultTableModel(new Object[][] {
}, new String[] { "Titulo", "Autores" }));
jScrollPane1.setViewportView(jTable1);
javax.swing.GroupLayout layout = new javax.swing.GroupLayout(getContentPane());
getContentPane().setLayout(layout);
// Horizontal axis: text field and button side by side, scroll pane (375 px) below them.
layout.setHorizontalGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
.addGroup(layout.createSequentialGroup().addContainerGap(19, Short.MAX_VALUE)
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING, false)
.addGroup(layout.createSequentialGroup().addComponent(jTextField1)
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.UNRELATED)
.addComponent(jButton1))
.addComponent(jScrollPane1, javax.swing.GroupLayout.PREFERRED_SIZE, 375,
javax.swing.GroupLayout.PREFERRED_SIZE))
.addContainerGap()));
// Vertical axis: 15 px gap, the text field/button row on a common baseline,
// then the scroll pane (275 px).
layout.setVerticalGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
.addGroup(layout.createSequentialGroup().addGap(15, 15, 15)
.addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.BASELINE)
.addComponent(jTextField1, javax.swing.GroupLayout.PREFERRED_SIZE,
javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.PREFERRED_SIZE)
.addComponent(jButton1))
.addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
.addComponent(jScrollPane1, javax.swing.GroupLayout.PREFERRED_SIZE, 275,
javax.swing.GroupLayout.PREFERRED_SIZE)
.addContainerGap(javax.swing.GroupLayout.DEFAULT_SIZE, Short.MAX_VALUE)));
// Size the frame to fit the preferred sizes computed above.
pack();
}
/**
 * Handler of the "Buscar" button: searches the configured field for the text
 * currently typed in the query box.
 *
 * @param evt the click event (unused)
 * @throws IOException if reading the index fails
 * @throws org.apache.lucene.queryparser.classic.ParseException if the typed text is not a valid query
 */
private void jButton1ActionPerformed(java.awt.event.ActionEvent evt)
        throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    final String typedQuery = jTextField1.getText();
    searchFiles(typedQuery, field);
}
/**
 * Opens the index stored under {@code indexPath} and builds a searcher on it.
 * The searcher is also kept in the {@code searcher} field.
 *
 * @return the searcher that was just created
 * @throws IOException if the index directory cannot be opened or read
 */
IndexSearcher createIndexSearcher() throws IOException {
    final IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    this.searcher = new IndexSearcher(reader);
    return this.searcher;
}
/**
 * Parses {@code queryString} against {@code field} and runs it on the index.
 *
 * The query is analyzed with the same analyzer the Indexer registers for the
 * field: EnglishAnalyzer for "title" and "abstract", StandardAnalyzer for the
 * rest. Using StandardAnalyzer for every field left query terms unstemmed,
 * so they could not match the stemmed terms stored for titles and abstracts.
 *
 * @param queryString  text typed by the user, in Lucene classic query syntax
 * @param field        default field the query is run against
 * @param resultNumber maximum number of hits to return
 * @return the top hits for the query
 * @throws IOException if reading the index fails
 * @throws org.apache.lucene.queryparser.classic.ParseException if the query text cannot be parsed
 */
TopDocs queryIndex(String queryString, String field, int resultNumber)
        throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    // Fully qualified names are used so that no new import is needed in this file.
    final org.apache.lucene.analysis.Analyzer analyzer =
            ("title".equals(field) || "abstract".equals(field))
                    ? new org.apache.lucene.analysis.en.EnglishAnalyzer()
                    : new StandardAnalyzer();
    Query query = new QueryParser(field, analyzer).parse(queryString);
    return searcher.search(query, resultNumber);
}
/**
 * Prints the total hit count to stdout and shows the hits in the results table.
 *
 * The table is emptied first; otherwise the rows of every previous search
 * stay in the model and each new search only appends to them.
 *
 * @param docs hits returned by the search
 * @throws IOException if a stored document cannot be read from the index
 */
void showResults(TopDocs docs) throws IOException {
    System.out.println(docs.totalHits);
    DefaultTableModel model = (DefaultTableModel) jTable1.getModel();
    // Drop the rows left over from the previous search.
    model.setRowCount(0);
    for (ScoreDoc scoreDoc : docs.scoreDocs) {
        Document doc = searcher.doc(scoreDoc.doc);
        model.addRow(new Object[] { doc.get("title"), doc.get("authors") });
    }
}
/**
 * Runs {@code query} against {@code field} and displays at most 20 hits.
 *
 * @param query text of the query
 * @param field index field to search
 * @throws IOException if reading the index fails
 * @throws org.apache.lucene.queryparser.classic.ParseException if the query text cannot be parsed
 */
void searchFiles(String query, String field)
        throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    final int maxHits = 20;
    showResults(queryIndex(query, field, maxHits));
}
/**
 * Checks that {@code fieldContent} names a searchable field. On any other
 * value the list of valid names is printed to stdout and the JVM exits with
 * status 1.
 *
 * @param fieldContent field name received on the command line
 */
private static void validateField(String fieldContent) {
    final String[] accepted = { "title", "authors", "institutions", "abstract", "emails" };
    for (String candidate : accepted) {
        if (candidate.equals(fieldContent)) {
            return;
        }
    }
    System.out.println("Wrong field name. Available options:");
    // Listed in the same order as the original help output.
    final String[] listing = { "authors", "title", "abstract", "institutions", "emails" };
    for (String option : listing) {
        System.out.println(option);
    }
    System.exit(1);
}
/**
 * Prints the command-line synopsis and exits with status 1.
 *
 * The program requires exactly two arguments, the data directory and the
 * field to search, so the synopsis names both; the previous text mentioned
 * only the directory.
 */
private static void usage() {
    System.out.println("Usage: Searcher <directory> <field>");
    System.exit(1);
}
/**
 * Entry point. Expects the data directory and the search field as arguments,
 * rebuilds the index under ".index", selects the Nimbus look and feel when it
 * is installed and finally opens the search window on the event dispatch thread.
 *
 * @param args data directory followed by the field name
 * @throws IOException if the documents or the index cannot be read or written
 * @throws ParseException declared by the Indexer
 * @throws org.apache.lucene.queryparser.classic.ParseException declared for query parsing
 */
public static void main(String[] args)
        throws IOException, ParseException, org.apache.lucene.queryparser.classic.ParseException {
    if (args.length != 2) {
        usage();
    }
    final String dataDirectory = args[0];
    final String indexDirectory = ".index";
    final String searchField = args[1];
    validateField(searchField);
    new Indexer(dataDirectory, indexDirectory).populateIndex();
    try {
        for (javax.swing.UIManager.LookAndFeelInfo info : javax.swing.UIManager.getInstalledLookAndFeels()) {
            if ("Nimbus".equals(info.getName())) {
                javax.swing.UIManager.setLookAndFeel(info.getClassName());
                break;
            }
        }
    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException
            | javax.swing.UnsupportedLookAndFeelException ex) {
        // A look-and-feel failure is only logged; the default look and feel is used instead.
        java.util.logging.Logger.getLogger(Searcher.class.getName()).log(java.util.logging.Level.SEVERE, null, ex);
    }
    /* Create and display the form */
    java.awt.EventQueue.invokeLater(() -> {
        try {
            new Searcher(dataDirectory, indexDirectory, searchField).setVisible(true);
        } catch (IOException exp) {
            System.err.println(exp);
        }
    });
}
// Variables declaration - do not modify//GEN-BEGIN:variables
private javax.swing.JButton jButton1;
private javax.swing.JScrollPane jScrollPane1;
private javax.swing.JTable jTable1;
private javax.swing.JTextField jTextField1;
// End of variables declaration//GEN-END:variables
}

View File

@@ -0,0 +1,107 @@
<?xml version="1.0" encoding="UTF-8" ?>
<Form version="1.3" maxVersion="1.9" type="org.netbeans.modules.form.forminfo.JFrameFormInfo">
<Properties>
<Property name="defaultCloseOperation" type="int" value="3"/>
</Properties>
<SyntheticProperties>
<SyntheticProperty name="formSizePolicy" type="int" value="1"/>
<SyntheticProperty name="generateCenter" type="boolean" value="false"/>
</SyntheticProperties>
<AuxValues>
<AuxValue name="FormSettings_autoResourcing" type="java.lang.Integer" value="0"/>
<AuxValue name="FormSettings_autoSetComponentName" type="java.lang.Boolean" value="false"/>
<AuxValue name="FormSettings_generateFQN" type="java.lang.Boolean" value="true"/>
<AuxValue name="FormSettings_generateMnemonicsCode" type="java.lang.Boolean" value="false"/>
<AuxValue name="FormSettings_i18nAutoMode" type="java.lang.Boolean" value="false"/>
<AuxValue name="FormSettings_layoutCodeTarget" type="java.lang.Integer" value="1"/>
<AuxValue name="FormSettings_listenerGenerationStyle" type="java.lang.Integer" value="0"/>
<AuxValue name="FormSettings_variablesLocal" type="java.lang.Boolean" value="false"/>
<AuxValue name="FormSettings_variablesModifier" type="java.lang.Integer" value="2"/>
</AuxValues>
<Layout>
<DimensionLayout dim="0">
<Group type="103" groupAlignment="0" attributes="0">
<Group type="102" attributes="0">
<EmptySpace pref="19" max="32767" attributes="0"/>
<Group type="103" groupAlignment="0" max="-2" attributes="0">
<Group type="102" alignment="0" attributes="0">
<Component id="jTextField1" max="32767" attributes="0"/>
<EmptySpace type="unrelated" max="-2" attributes="0"/>
<Component id="jButton1" min="-2" max="-2" attributes="0"/>
</Group>
<Component id="jScrollPane1" min="-2" pref="375" max="-2" attributes="0"/>
</Group>
<EmptySpace max="-2" attributes="0"/>
</Group>
</Group>
</DimensionLayout>
<DimensionLayout dim="1">
<Group type="103" groupAlignment="0" attributes="0">
<Group type="102" alignment="0" attributes="0">
<EmptySpace min="-2" pref="15" max="-2" attributes="0"/>
<Group type="103" groupAlignment="3" attributes="0">
<Component id="jTextField1" alignment="3" min="-2" max="-2" attributes="0"/>
<Component id="jButton1" alignment="3" min="-2" max="-2" attributes="0"/>
</Group>
<EmptySpace max="-2" attributes="0"/>
<Component id="jScrollPane1" min="-2" pref="275" max="-2" attributes="0"/>
<EmptySpace max="32767" attributes="0"/>
</Group>
</Group>
</DimensionLayout>
</Layout>
<SubComponents>
<Component class="javax.swing.JTextField" name="jTextField1">
<Properties>
<Property name="toolTipText" type="java.lang.String" value=""/>
</Properties>
</Component>
<Component class="javax.swing.JButton" name="jButton1">
<Properties>
<Property name="text" type="java.lang.String" value="Buscar"/>
<Property name="toolTipText" type="java.lang.String" value=""/>
</Properties>
<Events>
<EventHandler event="actionPerformed" listener="java.awt.event.ActionListener" parameters="java.awt.event.ActionEvent" handler="jButton1ActionPerformed"/>
</Events>
</Component>
<Container class="javax.swing.JScrollPane" name="jScrollPane1">
<AuxValues>
<AuxValue name="autoScrollPane" type="java.lang.Boolean" value="true"/>
</AuxValues>
<Layout class="org.netbeans.modules.form.compat2.layouts.support.JScrollPaneSupportLayout"/>
<SubComponents>
<Component class="javax.swing.JTable" name="jTable1">
<Properties>
<Property name="model" type="javax.swing.table.TableModel" editor="org.netbeans.modules.form.editors2.TableModelEditor">
<Table columnCount="2" rowCount="0">
<Column editable="true" title="Titulo" type="java.lang.Object"/>
<Column editable="true" title="A&#xf1;o" type="java.lang.Object"/>
</Table>
</Property>
<Property name="columnModel" type="javax.swing.table.TableColumnModel" editor="org.netbeans.modules.form.editors2.TableColumnModelEditor">
<TableColumnModel selectionModel="0">
<Column maxWidth="-1" minWidth="-1" prefWidth="-1" resizable="true">
<Title/>
<Editor/>
<Renderer/>
</Column>
<Column maxWidth="-1" minWidth="-1" prefWidth="-1" resizable="true">
<Title/>
<Editor/>
<Renderer/>
</Column>
</TableColumnModel>
</Property>
<Property name="tableHeader" type="javax.swing.table.JTableHeader" editor="org.netbeans.modules.form.editors2.JTableHeaderEditor">
<TableHeader reorderingAllowed="true" resizingAllowed="true"/>
</Property>
</Properties>
</Component>
</SubComponents>
</Container>
</SubComponents>
</Form>