# text_processor.py
import logging
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize

from bot import STOPWORDS_MAP, NLTK_DEPENDENCIES


class TextProcessor:
    """Cleans, tokenizes, and normalizes text for a configured language."""

    def __init__(self, language='en'):
        self.language = language
        self.lemmatizer = WordNetLemmatizer()
        # Download any missing NLTK data first; the stopword set below
        # depends on the 'stopwords' corpus being available.
        self._ensure_nltk_data()
        self.stemmer = self._get_stemmer()
        self.stop_words = self._get_stopwords()

    def _ensure_nltk_data(self):
        # Download each required NLTK resource if it is not already installed,
        # checking both the tokenizers and corpora search paths.
        for dependency in NLTK_DEPENDENCIES:
            try:
                nltk.data.find(f'tokenizers/{dependency}')
            except LookupError:
                try:
                    nltk.data.find(f'corpora/{dependency}')
                except LookupError:
                    logging.info(f"Downloading NLTK dependency: {dependency}")
                    nltk.download(dependency, quiet=True)

    def _get_stemmer(self):
        # Map supported language codes to SnowballStemmer language names;
        # anything else falls back to the English stemmer.
        stemmer_map = {
            'en': 'english',
            'tr': 'turkish',
            'de': 'german'
        }
        if self.language in stemmer_map:
            return SnowballStemmer(stemmer_map[self.language])
        return SnowballStemmer('english')

    def _get_stopwords(self):
        # Fall back to English stopwords when the requested language has no
        # stopword list or the corpus lookup fails.
        try:
            if self.language in STOPWORDS_MAP:
                return set(stopwords.words(STOPWORDS_MAP[self.language]))
            return set(stopwords.words('english'))
        except LookupError:
            logging.warning(f"Stopwords not available for {self.language}, using English")
            return set(stopwords.words('english'))

    def clean_text(self, text):
        # Strip everything except Latin letters (including German and Turkish
        # characters) and whitespace, then collapse runs of whitespace.
        if not isinstance(text, str):
            return ""
        text = re.sub(r'[^a-zA-ZäöüßÄÖÜçğıİöşüÇĞIıÖŞÜ\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text.lower()

    def tokenize(self, text):
        try:
            return word_tokenize(text, language=STOPWORDS_MAP.get(self.language, 'english'))
        except Exception:
            # Fall back to a plain whitespace split if NLTK tokenization fails
            # (for example, when the tokenizer data is missing).
            return text.split()

    def remove_stopwords(self, tokens):
        return [token for token in tokens if token not in self.stop_words and len(token) > 1]

    def lemmatize_tokens(self, tokens):
        # WordNet lemmatization only supports English; other languages are
        # reduced with the Snowball stemmer instead.
        if self.language == 'en':
            return [self.lemmatizer.lemmatize(token) for token in tokens]
        return [self.stemmer.stem(token) for token in tokens]

    def preprocess(self, text):
        """Run the full pipeline: clean, tokenize, drop stopwords, normalize."""
        cleaned_text = self.clean_text(text)
        if not cleaned_text:
            return ""
        tokens = self.tokenize(cleaned_text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatize_tokens(tokens)
        tokens = [token for token in tokens if token]
        return ' '.join(tokens)

    def set_language(self, language):
        if language in STOPWORDS_MAP:
            self.language = language
            self.stemmer = self._get_stemmer()
            self.stop_words = self._get_stopwords()
        else:
            logging.warning(f"Language {language} not supported, using English")
            self.language = 'en'
            # Rebuild the stemmer and stopword set so they match the English fallback.
            self.stemmer = self._get_stemmer()
            self.stop_words = self._get_stopwords()
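

# Illustrative usage sketch (not part of the original module). The sample
# sentences are made up for demonstration, and the 'de' switch assumes that
# 'de' is a key in STOPWORDS_MAP from the bot module.
if __name__ == "__main__":
    processor = TextProcessor(language='en')
    # Full pipeline: clean -> tokenize -> drop stopwords -> lemmatize/stem.
    print(processor.preprocess("The cats were running quickly through the gardens!"))
    # Switching the language rebuilds the stemmer and stopword set.
    processor.set_language('de')
    print(processor.preprocess("Die Katzen liefen schnell durch die Gärten!"))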