hltcoe · eugene-yang · Mar 11, 2022 · Mar 29, 2022 · cash · Mar 29, 2022
diff --git a/patapsco/docs.py b/patapsco/docs.py
@@ -131,12 +131,14 @@ class IRDSDocumentReader(InputIterator, NoGlobSupport):
     The documents are downloaded to ~/.ir_datasets/
     """
 
-    def __init__(self, path, encoding, lang, **kwargs):
+    def __init__(self, path, encoding, lang, fields=None, **kwargs):
         """
         Args:
             path (str): ir_datasets name.
             encoding (str): Ignored.
             lang (str): Language of the documents.
+            fields (str): ir_datasets document field(s) to extract. Use `+` to combine multiple fields, e.g. `title+text`.
+                          If None, falls back to `default_text()` when the document provides it, otherwise to `text`.
             **kwargs (dict): Unused.
         """
         import ir_datasets
@@ -146,16 +148,30 @@ def __init__(self, path, encoding, lang, **kwargs):
         dataset_lang = LangStandardizer.iso_639_3(self.dataset.docs.lang)
         assert dataset_lang == self.lang, f"Document language code from {path} is not {lang} but {dataset_lang}."
         self.reader = iter(self.dataset.docs)
+        self.fields = fields.strip().split('+') if fields is not None else None
+
+        irds_doc_fields = self.dataset.docs_cls().__annotations__
+        if self.fields:
+            assert all( f in irds_doc_fields and irds_doc_fields[f] == str for f in self.fields ),\
+                f"Fields {self.fields} are not supported by irds/{self.path}."
 
     def __iter__(self):
         return self
 
     def __next__(self):
         doc = next(self.reader)
-        return Doc(doc.doc_id, self.lang, doc.text, None)
+        return Doc(doc.doc_id, self.lang, self._get_text(doc), 
+                   getattr(doc, 'time', None) or getattr(doc, 'date', None) )
 
     def __len__(self):
-        return len(self.dataset)
+        return len(self.dataset.docs)
+
+    def _get_text(self, doc):
+        if self.fields:
+            return " ".join([getattr(doc, f) for f in self.fields])
+        if hasattr(doc, "default_text"):
+            return doc.default_text()
+        return doc.text
 
 
 class DocWriter(Task):

diff --git a/patapsco/schema.py b/patapsco/schema.py
@@ -55,6 +55,7 @@ class DocumentsInputConfig(BaseConfig):
     lang: str
     encoding: str = "utf8"
     path: Union[str, list]
+    fields: Optional[str]
 
 
 class DocumentsConfig(SectionConfig):

diff --git a/patapsco/topics.py b/patapsco/topics.py
@@ -90,7 +90,8 @@ def _extract_fields(cls, fields_str):
         try:
             return [cls.FIELD_MAP[f.lower()] for f in fields]
         except KeyError as e:
-            raise ConfigError(f"Unrecognized topic field: {e}")
+            LOGGER.warning(f"Using unrecognized topic fields {e}, may cause unexpected results.")
+            return fields
 
 
 class SgmlTopicReader(InputIterator):
@@ -257,16 +258,24 @@ def __init__(self, path, encoding, lang, **kwargs):
         self.path = path
         self.lang = lang
         self.dataset = ir_datasets.load(self.path)
-        dataset_lang = LangStandardizer.iso_639_3(self.dataset.queries.lang)
-        assert dataset_lang == self.lang, f"Query language code from {path} is not {lang} but {dataset_lang}."
         self.queries = iter(self.dataset.queries)
 
     def __iter__(self):
         return self
 
     def __next__(self):
         q = next(self.queries)
-        return Topic(q.query_id, self.lang, q.text, getattr(q, 'description', None), None)
+        topic = Topic(q.query_id, self.lang, None, None, None)
+
+        for field in self.dataset.queries_cls()._fields:
+            if field in ['query_id']:
+                continue
+            elif field == 'description':
+                topic.desc = q.description
+            else:
+                setattr(topic, field, getattr(q, field))
+
+        return topic
 
     def __len__(self):
         return len(self.dataset.queries)

diff --git a/samples/configs/irds_hc4.yml b/samples/configs/irds_hc4.yml
@@ -0,0 +1,78 @@
+run:
+  name: ir_datasets testing using hc4_zho_test
+
+documents:
+  input:
+    format: irds
+    lang: zho
+    path: hc4/zh/test
+    fields: title+text
+  process:
+    normalize:
+      lowercase: true
+      report: false
+    stem: false
+    stopwords: lucene
+    strict_check: true
+    tokenize: spacy
+  output: true
+
+database:
+  name: sqlite
+  output: true
+
+index:
+  name: lucene
+  output: true
+
+topics:
+  input:
+    format: irds
+    lang: eng
+    source: original
+    encoding: utf8
+    path: hc4/zh/test
+  fields: title
+
+queries:
+  output: true
+  parse: false
+  process:
+    normalize:
+      lowercase: true
+      report: false
+    stem: false
+    stopwords: lucene
+    strict_check: false
+    tokenize: spacy
+  psq:
+    lang: eng
+    normalize:
+      lowercase: true
+      report: false
+    path: ./samples/data/eng_zho_transtable_small.dict 
+    stem: false
+    stopwords: lucene
+    threshold: 0.97
+
+retrieve:
+  b: 0.4
+  fb_docs: 10
+  fb_terms: 10
+  k1: 0.9
+  log_explanations: false
+  log_explanations_cutoff: 10
+  mu: 1000
+  name: bm25
+  number: 1000
+  original_query_weight: 0.5
+  output: retrieve
+  parse: false
+  psq: true
+  rm3: false
+  rm3_logging: false
+
+score:
+  input:
+    format: irds
+    path: hc4/zh/test
diff --git a/samples/configs/irds_hc4_ht_query.yml b/samples/configs/irds_hc4_ht_query.yml
@@ -0,0 +1,48 @@
+run:
+  name: ir_datasets testing using hc4_zho_test ht queries
+
+topics:
+  input:
+    format: irds
+    lang: zho
+    source: original
+    encoding: utf8
+    path: hc4/zh/test
+  fields: ht_title
+
+queries:
+  output: true
+  parse: false
+  process:
+    normalize:
+      lowercase: true
+      report: false
+    stem: false
+    stopwords: lucene
+    strict_check: false
+    tokenize: spacy
+
+retrieve:
+  input: 
+    index:
+      path: ./runs/ir_datasets-testing-using-hc4_zho_test/index
+  b: 0.4
+  fb_docs: 10
+  fb_terms: 10
+  k1: 0.9
+  log_explanations: false
+  log_explanations_cutoff: 10
+  mu: 1000
+  name: bm25
+  number: 1000
+  original_query_weight: 0.5
+  output: retrieve
+  parse: false
+  psq: false
+  rm3: false
+  rm3_logging: false
+
+score:
+  input:
+    format: irds
+    path: hc4/zh/test
diff --git a/samples/configs/irds_test.yml b/samples/configs/irds_test.yml
diff --git a/samples/data/eng_zho_transtable_small.dict b/samples/data/eng_zho_transtable_small.dict