Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,6 @@ jobs:
runs-on: ubuntu-latest
needs: validate-compute-block
services:
minio:
image: lazybit/minio
ports:
- 9000:9000
env:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
options: >-
--health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
--health-interval 5s
--health-retries 5
--health-timeout 5s
postgres:
image: postgres:15
ports:
Expand Down
29 changes: 24 additions & 5 deletions algorithms/lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(
self,
dtm: np.ndarray = None,
vocab: dict = None,
doc_ids: list[str] = [],
n_topics: int = 10,
max_iter: int = 10,
learning_method: str = "batch",
Expand All @@ -21,6 +22,12 @@ def __init__(
):
self.dtm: np.ndarray = dtm
self.vocab: dict = vocab
self.doc_ids = doc_ids

if len(self.doc_ids) != self.dtm.shape[0]:
raise ValueError(
"doc_ids length must match number of DTM rows"
)

self.n_topics = n_topics
self.max_iter = max_iter
Expand Down Expand Up @@ -58,6 +65,7 @@ def extract_doc_topics(self) -> pd.DataFrame:
self.doc_topic_dist,
columns=[f"topic_{i}" for i in range(self.n_topics)],
)
df.insert(0, "doc_id", self.doc_ids)

logger.debug(
f"Extracted doc-topic distribution DataFrame shape={df.shape}")
Expand All @@ -68,17 +76,28 @@ def extract_topic_terms(self):
Generate topic and top-terms DataFrame
"""
logger.info("Extracting top terms per topic...")
idx2term = {idx: term for term, idx in self.vocab.items()}

# NOTE:
# The order of `terms` is guaranteed to match the DTM column order.
# This is because the vocabulary is built in NLPVectorizer using:
# sorted_terms = sorted(all_terms)
# vocab = {term: i for i, term in enumerate(sorted_terms)}
# The same vocab indices are then used to construct the DTM columns.
# Since Python dicts preserve insertion order (>=3.7),
# list(self.vocab.keys())[i] correctly maps to DTM column i,
# and thus to lda.components_[topic_idx][i].
terms = list(self.vocab.keys())
topic_rows = []

for topic_idx, topic in enumerate(self.lda.components_):
sorted = np.argsort(topic)[::-1]
top_indices = sorted[:self.n_top_words]
sorted_idx = np.argsort(topic)[::-1]
top_indices = sorted_idx[: self.n_top_words]

for i in top_indices:
topic_rows.append({
"topic_id": topic_idx,
"term": idx2term[i],
"weight": topic[i]
"term": terms[int(i)],
"weight": topic[i],
})

df = pd.DataFrame(topic_rows)
Expand Down
8 changes: 8 additions & 0 deletions algorithms/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from typing import List
from dataclasses import dataclass


@dataclass
class PreprocessedDocument:
    """One document after preprocessing: its stable id plus its token stream.

    By project convention a token containing a space is an n-gram
    (``NLPVectorizer`` splits tokens from n-grams on that criterion).
    """

    # Stable identifier; used to align DTM rows and topic distributions.
    doc_id: str
    # Preprocessed terms; terms containing a space are n-grams.
    tokens: list[str]
103 changes: 103 additions & 0 deletions algorithms/vectorizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from typing import List
import numpy as np
from collections import Counter

from algorithms.models import PreprocessedDocument


class NLPVectorizer:
    """Derives frequency statistics, a bag-of-words view, a vocabulary and a
    document-term matrix (DTM) from preprocessed documents.

    A term containing a space is treated as an n-gram; any other term is a
    single token. ``analyze_frequencies()`` should run before ``build_bow()``,
    otherwise the per-entry ``freq``/``docs`` fields are 0.
    """

    def __init__(self, preprocessed_output: List["PreprocessedDocument"]):
        """Store the documents and initialize empty statistics containers.

        :param preprocessed_output: documents exposing ``doc_id`` and
            ``tokens`` (list of strings; n-grams contain internal spaces).
        """
        self.documents = preprocessed_output
        self.doc_ids = [doc.doc_id for doc in preprocessed_output]

        # Corpus-wide term statistics (filled by analyze_frequencies).
        self.token_frequency = Counter()           # total occurrences per token
        self.token_document_frequency = Counter()  # number of docs containing token
        self.ngram_frequency = Counter()           # total occurrences per n-gram
        self.ngram_document_frequency = Counter()  # number of docs containing n-gram

        # Derived representations (filled by the build_* methods).
        self.bag_of_words = []   # per-document list of term-metadata dicts
        self.vocab = {}          # term -> DTM column index (sorted-term order)
        self.reverse_vocab = []  # DTM column index -> term
        self.dtm = None          # (num_docs, num_terms) int ndarray, or None

    def analyze_frequencies(self):
        """Compute corpus and document frequencies for tokens and n-grams.

        Idempotent: the counters are cleared on every call, so repeated calls
        do not double-count (the previous version accumulated across calls).
        ``clear()`` is used instead of rebinding so external references to the
        counters stay valid.
        """
        self.token_frequency.clear()
        self.token_document_frequency.clear()
        self.ngram_frequency.clear()
        self.ngram_document_frequency.clear()

        for doc in self.documents:
            tokens = [t for t in doc.tokens if " " not in t]
            ngrams = [t for t in doc.tokens if " " in t]

            # Corpus frequency counts every occurrence; document frequency
            # counts each document at most once, hence the set().
            self.token_frequency.update(tokens)
            self.token_document_frequency.update(set(tokens))

            self.ngram_frequency.update(ngrams)
            self.ngram_document_frequency.update(set(ngrams))

    def build_bow(self):
        """Build a per-document bag-of-words with term metadata.

        Each document maps to a list of dicts — one per unique term, in
        first-occurrence order — carrying the term's type, span (word count),
        corpus frequency and document frequency. Requires
        ``analyze_frequencies()`` to have run for non-zero ``freq``/``docs``.

        :return: the list of per-document entry lists (also stored on
            ``self.bag_of_words``).
        """
        bow = []

        for doc in self.documents:
            entries = []
            seen = set()

            for term in doc.tokens:
                if term in seen:
                    continue  # keep only the first occurrence per document
                seen.add(term)

                is_ngram = " " in term
                # Pick the matching corpus/document counters once.
                freq_counter = (
                    self.ngram_frequency if is_ngram else self.token_frequency
                )
                doc_counter = (
                    self.ngram_document_frequency
                    if is_ngram
                    else self.token_document_frequency
                )

                entries.append({
                    "term": term,
                    "type": "ngram" if is_ngram else "word",
                    "span": len(term.split(" ")),
                    "freq": freq_counter[term],
                    "docs": doc_counter[term],
                    "filters": [],
                })

            bow.append(entries)

        self.bag_of_words = bow
        return bow

    def build_vocabulary(self):
        """Build a sorted term -> DTM-column-index mapping over the corpus.

        Sorting makes the column order deterministic; downstream code (the
        LDA block) relies on this ordering to map topic components back to
        terms.

        :return: the vocab dict (also stored on ``self.vocab``).
        """
        sorted_terms = sorted(
            {term for doc in self.documents for term in doc.tokens}
        )
        self.vocab = {term: i for i, term in enumerate(sorted_terms)}
        self.reverse_vocab = sorted_terms

        return self.vocab

    def build_dtm(self):
        """Build the (num_docs, num_terms) document-term count matrix.

        Builds the vocabulary first if it has not been built yet.

        :return: the int ndarray of term counts (also stored on ``self.dtm``).
        """
        if not self.vocab:
            self.build_vocabulary()

        dtm = np.zeros((len(self.documents), len(self.vocab)), dtype=int)

        for row, doc in enumerate(self.documents):
            for term in doc.tokens:
                dtm[row, self.vocab[term]] += 1

        self.dtm = dtm
        return dtm
35 changes: 10 additions & 25 deletions cbc.yaml
Original file line number Diff line number Diff line change
@@ -1,39 +1,24 @@
author: Paul Kalhorn
description: Compute Block that offers Topic Modeling Algorihtms
description: Compute Block that offers topic modeling algorithms
docker_image: ghcr.io/rwth-time/topic-modeling/topic-modeling
entrypoints:
lda_topic_modeling:
description: Sklearn LDA Topic Modeling
description: Sklearn LDA Topic Modeling
envs:
LEARNING_METHOD: batch
MAX_ITER: 10
N_TOPICS: 5
N_TOP_WORDS: 10
inputs:
dtm:
preprocessed_docs:
config:
dtm_BUCKET_NAME: null
dtm_FILE_EXT: pkl
dtm_FILE_NAME: null
dtm_FILE_PATH: null
dtm_S3_ACCESS_KEY: null
dtm_S3_HOST: null
dtm_S3_PORT: null
dtm_S3_SECRET_KEY: null
description: Pkl file of your numpy representation of the document-term matrix
type: file
vocab:
config:
vocab_BUCKET_NAME: null
vocab_FILE_EXT: pkl
vocab_FILE_NAME: null
vocab_FILE_PATH: null
vocab_S3_ACCESS_KEY: null
vocab_S3_HOST: null
vocab_S3_PORT: null
vocab_S3_SECRET_KEY: null
description: Pkl file of a dictionary that maps all words to their index in the DTM
type: file
preprocessed_docs_DB_TABLE: null
preprocessed_docs_PG_HOST: null
preprocessed_docs_PG_PASS: null
preprocessed_docs_PG_PORT: null
preprocessed_docs_PG_USER: null
description: A database table expected to have doc_id (string) and tokens (list of strings) columns
type: pg_table
outputs:
doc_topic:
config:
Expand Down
Binary file removed dtm.pkl
Binary file not shown.
Loading
Loading