Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions patapsco/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,14 @@ class IRDSDocumentReader(InputIterator, NoGlobSupport):
The documents are downloaded to ~/.ir_datasets/
"""

def __init__(self, path, encoding, lang, **kwargs):
def __init__(self, path, encoding, lang, fields=None, **kwargs):
"""
Args:
path (str): ir_datasets name.
encoding (str): Ignored.
lang (str): Language of the documents.
fields (str): ir_dataset document field(s) that will be extracted. Use `+` to indicate multiple fields.
If None, fall back to `default_text()` or `text`.
**kwargs (dict): Unused.
"""
import ir_datasets
Expand All @@ -146,16 +148,30 @@ def __init__(self, path, encoding, lang, **kwargs):
dataset_lang = LangStandardizer.iso_639_3(self.dataset.docs.lang)
assert dataset_lang == self.lang, f"Document language code from {path} is not {lang} but {dataset_lang}."
self.reader = iter(self.dataset.docs)
self.fields = fields.strip().split('+') if fields is not None else None

irds_doc_fields = self.dataset.docs_cls().__annotations__
if self.fields:
assert all( f in irds_doc_fields and irds_doc_fields[f] == str for f in self.fields ),\
f"Fields {self.fields} are not supported by irds/{self.path}."

def __iter__(self):
    """Return this reader itself; iteration is driven by its ``__next__``."""
    return self

def __next__(self):
doc = next(self.reader)
return Doc(doc.doc_id, self.lang, doc.text, None)
return Doc(doc.doc_id, self.lang, self._get_text(doc),
getattr(doc, 'time', None) or getattr(doc, 'date', None) )

def __len__(self):
return len(self.dataset)
return len(self.dataset.docs)

def _get_text(self, doc):
    """Return the text that represents ``doc``.

    When ``self.fields`` is non-empty, the named attributes of ``doc`` are
    read in the configured order and joined with a single space. Otherwise
    the result of ``doc.default_text()`` is used when the document provides
    that method, and ``doc.text`` is the last resort.

    Args:
        doc: Document record produced by the dataset iterator.

    Returns:
        The extracted document text.
    """
    if self.fields:
        # Explicitly configured fields take precedence over any default.
        return " ".join(getattr(doc, name) for name in self.fields)
    if hasattr(doc, "default_text"):
        return doc.default_text()
    return doc.text


class DocWriter(Task):
Expand Down
1 change: 1 addition & 0 deletions patapsco/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class DocumentsInputConfig(BaseConfig):
lang: str
encoding: str = "utf8"
path: Union[str, list]
fields: Optional[str]


class DocumentsConfig(SectionConfig):
Expand Down
17 changes: 13 additions & 4 deletions patapsco/topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ def _extract_fields(cls, fields_str):
try:
return [cls.FIELD_MAP[f.lower()] for f in fields]
except KeyError as e:
raise ConfigError(f"Unrecognized topic field: {e}")
LOGGER.warning(f"Using unrecognized topic fields {e}, may cause unexpected results.")
return fields
Comment on lines +93 to +94
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why wouldn't we want this to throw an exception?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing this is because of item 2:

Support arbitrary fields in TopicProcessor for arbitrary query fields in ir_datasets. This is particularly important for integrating the mt_* and ht_* fields in the HC4 interface in ir_datasets. (citing https://github.com/allenai/ir_datasets/issues/148)

I'll have to look more into this as I prefer to keep the checking in to catch typos or other problems with data.



class SgmlTopicReader(InputIterator):
Expand Down Expand Up @@ -257,16 +258,24 @@ def __init__(self, path, encoding, lang, **kwargs):
self.path = path
self.lang = lang
self.dataset = ir_datasets.load(self.path)
dataset_lang = LangStandardizer.iso_639_3(self.dataset.queries.lang)
assert dataset_lang == self.lang, f"Query language code from {path} is not {lang} but {dataset_lang}."
Comment on lines -260 to -261
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A user must list the language in the topics config, but it is not checked against the language of the downloaded dataset. Is the language code of the dataset not consistent? I'd probably put a comment in there to indicate we're not checking the language because xyz.

self.queries = iter(self.dataset.queries)

def __iter__(self):
    """Return this reader itself; iteration is driven by its ``__next__``."""
    return self

def __next__(self):
q = next(self.queries)
return Topic(q.query_id, self.lang, q.text, getattr(q, 'description', None), None)
topic = Topic(q.query_id, self.lang, None, None, None)

for field in self.dataset.queries_cls()._fields:
if field in ['query_id']:
continue
elif field == 'description':
topic.desc = q.description
else:
setattr(topic, field, getattr(q, field))

return topic

def __len__(self):
    """Return the length of the loaded dataset's ``queries`` collection."""
    return len(self.dataset.queries)
Expand Down
78 changes: 78 additions & 0 deletions samples/configs/irds_hc4.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
run:
name: ir_datasets testing using hc4_zho_test

documents:
input:
format: irds
lang: zho
path: hc4/zh/test
fields: title+text
process:
normalize:
lowercase: true
report: false
stem: false
stopwords: lucene
strict_check: true
tokenize: spacy
output: true

database:
name: sqlite
output: true

index:
name: lucene
output: true

topics:
input:
format: irds
lang: eng
source: original
encoding: utf8
path: hc4/zh/test
fields: title

queries:
output: true
parse: false
process:
normalize:
lowercase: true
report: false
stem: false
stopwords: lucene
strict_check: false
tokenize: spacy
psq:
lang: eng
normalize:
lowercase: true
report: false
path: ./samples/data/eng_zho_transtable_small.dict
stem: false
stopwords: lucene
threshold: 0.97

retrieve:
b: 0.4
fb_docs: 10
fb_terms: 10
k1: 0.9
log_explanations: false
log_explanations_cutoff: 10
mu: 1000
name: bm25
number: 1000
original_query_weight: 0.5
output: retrieve
parse: false
psq: true
rm3: false
rm3_logging: false

score:
input:
format: irds
path: hc4/zh/test
48 changes: 48 additions & 0 deletions samples/configs/irds_hc4_ht_query.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
run:
name: ir_datasets testing using hc4_zho_test ht queries

topics:
input:
format: irds
lang: zho
source: original
encoding: utf8
path: hc4/zh/test
fields: ht_title

queries:
output: true
parse: false
process:
normalize:
lowercase: true
report: false
stem: false
stopwords: lucene
strict_check: false
tokenize: spacy

retrieve:
input:
index:
path: ./runs/ir_datasets-testing-using-hc4_zho_test/index
b: 0.4
fb_docs: 10
fb_terms: 10
k1: 0.9
log_explanations: false
log_explanations_cutoff: 10
mu: 1000
name: bm25
number: 1000
original_query_weight: 0.5
output: retrieve
parse: false
psq: false
rm3: false
rm3_logging: false

score:
input:
format: irds
path: hc4/zh/test
44 changes: 0 additions & 44 deletions samples/configs/irds_test.yml

This file was deleted.

1 change: 1 addition & 0 deletions samples/data/eng_zho_transtable_small.dict

Large diffs are not rendered by default.