Skip to content

Commit 9f5dd7e

Browse files
authored
Merge pull request #14 from loodos/dev
Fix a bug in sentence analysis and bump version to 0.2.3
2 parents 7152f44 + 4672384 commit 9f5dd7e

File tree

4 files changed

+50
-21
lines changed

4 files changed

+50
-21
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setuptools.setup(
88
name='zemberek-python',
9-
version='0.2.2',
9+
version='0.2.3',
1010
author='Loodos',
1111
description='Python port of open source text processing library for Turkish, zemberek-nlp',
1212
long_description=long_description,

zemberek/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import logging
66
import sys
77

8-
__version__ = '0.2.2'
8+
__version__ = '0.2.3'
99

1010
root = logging.getLogger()
1111
root.setLevel(logging.INFO)

zemberek/morphology/ambiguity/perceptron_ambiguity_resolver.py

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from __future__ import annotations
22

3-
from typing import List, TYPE_CHECKING, DefaultDict, Any, Optional
3+
from typing import List, TYPE_CHECKING, DefaultDict, Any, Optional, Tuple, Dict
44

55
from operator import attrgetter
6-
from collections import defaultdict
6+
from collections import defaultdict, OrderedDict
77
import numpy as np
88

99
if TYPE_CHECKING:
@@ -65,13 +65,18 @@ def last_group(self) -> str:
6565

6666
class FeatureExtractor:
6767

68+
feature_cache: Dict[Tuple[SingleAnalysis, ...], DefaultDict[Any, np.int32]] = dict()
69+
6870
def __init__(self, use_cache: bool):
6971
self.use_cache = use_cache
7072

7173
def extract_from_trigram(self, trigram: List[SingleAnalysis]) -> DefaultDict[Any, np.int32]:
7274

7375
if self.use_cache:
74-
raise ValueError(f"feature cache for FeatureExtractor has not been implemented yet!")
76+
# raise ValueError(f"feature cache for FeatureExtractor has not been implemented yet!")
77+
cached = self.feature_cache.get(tuple(trigram))
78+
if cached is not None:
79+
return cached
7580

7681
feats = defaultdict(np.int32)
7782

@@ -89,7 +94,7 @@ def extract_from_trigram(self, trigram: List[SingleAnalysis]) -> DefaultDict[Any
8994
r2: str = w2.lemma
9095
r3: str = w3.lemma
9196

92-
ig1: str = '+'.join(w1.igs)
97+
# ig1: str = '+'.join(w1.igs)
9398
ig2: str = '+'.join(w2.igs)
9499
ig3: str = '+'.join(w3.igs)
95100

@@ -118,8 +123,12 @@ def extract_from_trigram(self, trigram: List[SingleAnalysis]) -> DefaultDict[Any
118123

119124
feats[f"22:{trigram[2].group_boundaries.shape[0]}"] += 1
120125

121-
for k in feats.keys():
122-
feats[k] = np.int32(feats[k])
126+
# do this outside
127+
# for k in feats.keys():
128+
# feats[k] = np.int32(feats[k])
129+
130+
if self.use_cache:
131+
self.feature_cache[tuple(trigram)] = feats
123132

124133
return feats
125134

@@ -138,12 +147,23 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve
138147
PerceptronAmbiguityResolver.sentence_begin,
139148
PerceptronAmbiguityResolver.sentence_begin,
140149
previous=None,
141-
score=0
150+
score=np.float32(0)
142151
)
143152
]
153+
# current_list: OrderedDict['PerceptronAmbiguityResolver.Hypothesis', np.float32] = OrderedDict(
154+
# [
155+
# (PerceptronAmbiguityResolver.Hypothesis(
156+
# PerceptronAmbiguityResolver.sentence_begin,
157+
# PerceptronAmbiguityResolver.sentence_begin,
158+
# previous=None,
159+
# score=np.float32(0)
160+
# ), np.float32(0))
161+
# ]
162+
# )
144163

145164
for analysis_data in sentence:
146165
next_list: List['PerceptronAmbiguityResolver.Hypothesis'] = []
166+
# next_list: OrderedDict['PerceptronAmbiguityResolver.Hypothesis', np.float32] = OrderedDict()
147167

148168
analyses: List[SingleAnalysis] = list(analysis_data.analysis_results)
149169

@@ -157,15 +177,26 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve
157177

158178
trigram_score = np.float32(0)
159179
for key in features.keys():
160-
trigram_score += self.model.get_(key) * features.get(key)
180+
trigram_score += np.float32(self.model.get_(key) * np.float32(features.get(key)))
161181

162182
new_hyp = PerceptronAmbiguityResolver.Hypothesis(
163183
h.current,
164184
analysis,
165185
h,
166-
score=h.score + trigram_score
186+
score=np.float32(h.score + trigram_score)
167187
)
168-
next_list.append(new_hyp)
188+
189+
i, found = next(((i, c) for i, c in enumerate(next_list) if new_hyp == c), (None, None))
190+
191+
if found is not None and new_hyp.score > found.score:
192+
next_list[i] = new_hyp
193+
elif found is None:
194+
next_list.append(new_hyp)
195+
# if new_hyp in next_list:
196+
# new_hyp.score = max(next_list[new_hyp], new_hyp.score)
197+
198+
# next_list[new_hyp] = new_hyp.score
199+
# next_list.append(new_hyp)
169200

170201
current_list = next_list
171202

@@ -175,7 +206,7 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve
175206

176207
trigram_score = np.float32(0)
177208
for key in features.keys():
178-
trigram_score += self.model.get_(key) * features.get(key)
209+
trigram_score += np.float32(self.model.get_(key) * np.float32(features.get(key)))
179210

180211
h.score += trigram_score
181212

@@ -189,10 +220,8 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve
189220

190221
return PerceptronAmbiguityResolver.DecodeResult(list(reversed(result)), best_score)
191222

192-
193-
194223
class DecodeResult:
195-
def __init__(self, best_parse: List[SingleAnalysis], score: float):
224+
def __init__(self, best_parse: List[SingleAnalysis], score: np.float32):
196225
self.best_parse = best_parse
197226
self.score = score
198227

@@ -202,7 +231,7 @@ def __init__(
202231
prev: SingleAnalysis,
203232
current: SingleAnalysis,
204233
previous: Optional['PerceptronAmbiguityResolver.Hypothesis'],
205-
score: float
234+
score: np.float32
206235
):
207236
self.prev = prev
208237
self.current = current

zemberek/morphology/turkish_morphology.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,9 @@ def create_with_defaults() -> 'TurkishMorphology':
6363
logger.info(f"TurkishMorphology instance initialized in {time.time() - start_time}")
6464
return instance
6565

66-
@lru_cache(maxsize=200)
67-
def analyze(self, word: str) -> WordAnalysis:
68-
return self.analyze_without_cache(word=word)
66+
@lru_cache(maxsize=250)
67+
def analyze(self, word: str = None, token: Token = None) -> WordAnalysis:
68+
return self.analyze_without_cache(word=word, token=token)
6969

7070
@staticmethod
7171
def normalize_for_analysis(word: str) -> str:
@@ -81,7 +81,7 @@ def analyze_sentence(self, sentence: str) -> List[WordAnalysis]:
8181

8282
normalized = TextUtil.normalize_quotes_hyphens(sentence)
8383
result = [
84-
self.analyze_without_cache(token=t) for t in self.tokenizer.tokenize(normalized)
84+
self.analyze(token=t) for t in self.tokenizer.tokenize(normalized)
8585
]
8686

8787
return result

0 commit comments

Comments
 (0)