Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setuptools.setup(
name='zemberek-python',
version='0.2.2',
version='0.2.3',
author='Loodos',
description='Python port of open source text processing library for Turkish, zemberek-nlp',
long_description=long_description,
Expand Down
2 changes: 1 addition & 1 deletion zemberek/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
import sys

__version__ = '0.2.2'
__version__ = '0.2.3'

root = logging.getLogger()
root.setLevel(logging.INFO)
Expand Down
59 changes: 44 additions & 15 deletions zemberek/morphology/ambiguity/perceptron_ambiguity_resolver.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from __future__ import annotations

from typing import List, TYPE_CHECKING, DefaultDict, Any, Optional
from typing import List, TYPE_CHECKING, DefaultDict, Any, Optional, Tuple, Dict

from operator import attrgetter
from collections import defaultdict
from collections import defaultdict, OrderedDict
import numpy as np

if TYPE_CHECKING:
Expand Down Expand Up @@ -65,13 +65,18 @@ def last_group(self) -> str:

class FeatureExtractor:

feature_cache: Dict[Tuple[SingleAnalysis, ...], DefaultDict[Any, np.int32]] = dict()

def __init__(self, use_cache: bool):
self.use_cache = use_cache

def extract_from_trigram(self, trigram: List[SingleAnalysis]) -> DefaultDict[Any, np.int32]:

if self.use_cache:
raise ValueError(f"feature cache for FeatureExtractor has not been implemented yet!")
# raise ValueError(f"feature cache for FeatureExtractor has not been implemented yet!")
cached = self.feature_cache.get(tuple(trigram))
if cached is not None:
return cached

feats = defaultdict(np.int32)

Expand All @@ -89,7 +94,7 @@ def extract_from_trigram(self, trigram: List[SingleAnalysis]) -> DefaultDict[Any
r2: str = w2.lemma
r3: str = w3.lemma

ig1: str = '+'.join(w1.igs)
# ig1: str = '+'.join(w1.igs)
ig2: str = '+'.join(w2.igs)
ig3: str = '+'.join(w3.igs)

Expand Down Expand Up @@ -118,8 +123,12 @@ def extract_from_trigram(self, trigram: List[SingleAnalysis]) -> DefaultDict[Any

feats[f"22:{trigram[2].group_boundaries.shape[0]}"] += 1

for k in feats.keys():
feats[k] = np.int32(feats[k])
# do this outside
# for k in feats.keys():
# feats[k] = np.int32(feats[k])

if self.use_cache:
self.feature_cache[tuple(trigram)] = feats

return feats

Expand All @@ -138,12 +147,23 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve
PerceptronAmbiguityResolver.sentence_begin,
PerceptronAmbiguityResolver.sentence_begin,
previous=None,
score=0
score=np.float32(0)
)
]
# current_list: OrderedDict['PerceptronAmbiguityResolver.Hypothesis', np.float32] = OrderedDict(
# [
# (PerceptronAmbiguityResolver.Hypothesis(
# PerceptronAmbiguityResolver.sentence_begin,
# PerceptronAmbiguityResolver.sentence_begin,
# previous=None,
# score=np.float32(0)
# ), np.float32(0))
# ]
# )

for analysis_data in sentence:
next_list: List['PerceptronAmbiguityResolver.Hypothesis'] = []
# next_list: OrderedDict['PerceptronAmbiguityResolver.Hypothesis', np.float32] = OrderedDict()

analyses: List[SingleAnalysis] = list(analysis_data.analysis_results)

Expand All @@ -157,15 +177,26 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve

trigram_score = np.float32(0)
for key in features.keys():
trigram_score += self.model.get_(key) * features.get(key)
trigram_score += np.float32(self.model.get_(key) * np.float32(features.get(key)))

new_hyp = PerceptronAmbiguityResolver.Hypothesis(
h.current,
analysis,
h,
score=h.score + trigram_score
score=np.float32(h.score + trigram_score)
)
next_list.append(new_hyp)

i, found = next(((i, c) for i, c in enumerate(next_list) if new_hyp == c), (None, None))

if found is not None and new_hyp.score > found.score:
next_list[i] = new_hyp
elif found is None:
next_list.append(new_hyp)
# if new_hyp in next_list:
# new_hyp.score = max(next_list[new_hyp], new_hyp.score)

# next_list[new_hyp] = new_hyp.score
# next_list.append(new_hyp)

current_list = next_list

Expand All @@ -175,7 +206,7 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve

trigram_score = np.float32(0)
for key in features.keys():
trigram_score += self.model.get_(key) * features.get(key)
trigram_score += np.float32(self.model.get_(key) * np.float32(features.get(key)))

h.score += trigram_score

Expand All @@ -189,10 +220,8 @@ def best_path(self, sentence: List[WordAnalysis]) -> 'PerceptronAmbiguityResolve

return PerceptronAmbiguityResolver.DecodeResult(list(reversed(result)), best_score)



class DecodeResult:
def __init__(self, best_parse: List[SingleAnalysis], score: float):
def __init__(self, best_parse: List[SingleAnalysis], score: np.float32):
self.best_parse = best_parse
self.score = score

Expand All @@ -202,7 +231,7 @@ def __init__(
prev: SingleAnalysis,
current: SingleAnalysis,
previous: Optional['PerceptronAmbiguityResolver.Hypothesis'],
score: float
score: np.float32
):
self.prev = prev
self.current = current
Expand Down
8 changes: 4 additions & 4 deletions zemberek/morphology/turkish_morphology.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ def create_with_defaults() -> 'TurkishMorphology':
logger.info(f"TurkishMorphology instance initialized in {time.time() - start_time}")
return instance

@lru_cache(maxsize=200)
def analyze(self, word: str) -> WordAnalysis:
return self.analyze_without_cache(word=word)
@lru_cache(maxsize=250)
def analyze(self, word: str = None, token: Token = None) -> WordAnalysis:
    """Cached single-word morphological analysis.

    Thin memoizing wrapper that forwards both keyword arguments unchanged to
    ``analyze_without_cache``. Presumably exactly one of ``word`` / ``token``
    is supplied per call -- TODO confirm against analyze_without_cache.

    NOTE(review): ``lru_cache`` on an instance method includes ``self`` in the
    cache key and keeps this instance alive for the cache's lifetime
    (ruff B019); every ``Token`` passed must also be hashable for the cache
    to accept it -- verify.
    """
    return self.analyze_without_cache(word=word, token=token)

@staticmethod
def normalize_for_analysis(word: str) -> str:
Expand All @@ -81,7 +81,7 @@ def analyze_sentence(self, sentence: str) -> List[WordAnalysis]:

normalized = TextUtil.normalize_quotes_hyphens(sentence)
result = [
self.analyze_without_cache(token=t) for t in self.tokenizer.tokenize(normalized)
self.analyze(token=t) for t in self.tokenizer.tokenize(normalized)
]

return result
Expand Down