Skip to content

Commit 9f44053

Browse files
authored
add example code for language identifier and multi analyzer (#2919)
Signed-off-by: aoiasd <[email protected]>
1 parent 058836d commit 9f44053

File tree

3 files changed

+230
-0
lines changed

3 files changed

+230
-0
lines changed
File renamed without changes.
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
from pymilvus import MilvusClient, DataType, Function, FunctionType

# Example: full-text (BM25) search over multilingual text using the
# `language_identifier` tokenizer, which auto-detects each document's
# language and routes it to the matching analyzer.

# 1. Setup Milvus Client
client = MilvusClient("http://localhost:19530")
COLLECTION_NAME = "multilingual_test_B"
# Start from a clean slate so the example is re-runnable.
if client.has_collection(collection_name=COLLECTION_NAME):
    client.drop_collection(collection_name=COLLECTION_NAME)

# 2. Define analyzers for multiple languages
# These individual analyzer definitions will be reused by both methods.
analyzers = {
    "Japanese": {
        # Use lindera with the Japanese dictionary 'ipadic',
        # and remove punctuation because the lindera tokenizer retains it.
        "tokenizer": {
            "type": "lindera",
            "dict_kind": "ipadic",
        },
        "filter": ["removepunct"],
    },
    "English": {
        # Use the built-in English analyzer.
        "type": "english",
    },
    "default": {
        # Use the icu tokenizer as a fallback for any undetected language.
        "tokenizer": "icu",
    },
}

# --- Option B: Using Language Identifier Tokenizer ---
print("\n--- Demonstrating Language Identifier Tokenizer ---")

# 3B. Create a collection with the language identifier
analyzer_params_langid = {
    "tokenizer": {
        "type": "language_identifier",
        "analyzers": analyzers,  # Referencing the analyzers defined in Step 2
    },
}

schema_langid = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=False,
)
schema_langid.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
# A 'language' field is not strictly needed by the analyzer itself here, as
# detection is automatic; you might keep one for metadata purposes.
schema_langid.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=25565, enable_analyzer=True, analyzer_params=analyzer_params_langid)
schema_langid.add_field(field_name="text_sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)  # BM25 sparse vector

# Add the BM25 function that derives the sparse vector from the text field.
text_bm25_function_langid = Function(
    name="text_bm25",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names=["text_sparse"],
)
schema_langid.add_function(text_bm25_function_langid)

index_params_langid = client.prepare_index_params()
index_params_langid.add_index(
    field_name="text_sparse",
    index_type="AUTOINDEX",  # Use auto index for BM25
    metric_type="BM25",
)

client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema_langid,
    index_params=index_params_langid,
)
print(f"Collection '{COLLECTION_NAME}' created successfully with Language Identifier Tokenizer.")

# 4B. Insert Data for Language Identifier Tokenizer and Load Collection
# Insert English and Japanese movie titles. The language_identifier will detect the language.
client.insert(
    collection_name=COLLECTION_NAME,
    data=[
        {"text": "The Lord of the Rings"},
        {"text": "Spirited Away"},
        {"text": "千と千尋の神隠し"},
        {"text": "君の名は。"},
    ],
)
print(f"Inserted multilingual data into '{COLLECTION_NAME}'.")

# Load the collection into memory before searching.
client.load_collection(collection_name=COLLECTION_NAME)

# 5B. Perform a full-text search with the Language Identifier Tokenizer.
# No need to specify analyzer_name in search_params; it's detected automatically for the query.
print("\n--- Search results for Language Identifier Tokenizer ---")
results_langid_jp = client.search(
    collection_name=COLLECTION_NAME,
    data=["神隠し"],
    limit=2,
    output_fields=["text"],
    search_params={"metric_type": "BM25"},  # Analyzer automatically determined by language_identifier
    consistency_level="Strong",
)
print("\nSearch results for '神隠し' (Language Identifier Tokenizer):")
for result in results_langid_jp[0]:
    print(result)

results_langid_en = client.search(
    collection_name=COLLECTION_NAME,
    data=["the Rings"],
    limit=2,
    output_fields=["text"],
    search_params={"metric_type": "BM25"},  # Analyzer automatically determined by language_identifier
    consistency_level="Strong",
)
print("\nSearch results for 'the Rings' (Language Identifier Tokenizer):")
for result in results_langid_en[0]:
    print(result)

# Clean up the example collection.
client.drop_collection(collection_name=COLLECTION_NAME)
print(f"Collection '{COLLECTION_NAME}' dropped.")
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
from pymilvus import MilvusClient, DataType, Function, FunctionType

# Example: full-text (BM25) search over multilingual text using the
# Multi-Language Analyzer, where each row's 'language' field selects the
# analyzer applied to its text.

# 1. Setup Milvus Client
client = MilvusClient("http://localhost:19530")
COLLECTION_NAME = "multilingual_test_A"
# Start from a clean slate so the example is re-runnable.
if client.has_collection(collection_name=COLLECTION_NAME):
    client.drop_collection(collection_name=COLLECTION_NAME)

# 2. Define analyzers for multiple languages
# These individual analyzer definitions will be reused by both methods.
analyzers = {
    "Japanese": {
        # Use lindera with the Japanese dictionary 'ipadic',
        # and remove punctuation because the lindera tokenizer retains it.
        "tokenizer": {
            "type": "lindera",
            "dict_kind": "ipadic",
        },
        "filter": ["removepunct"],
    },
    "English": {
        # Use the built-in English analyzer.
        "type": "english",
    },
    "default": {
        # Use the icu tokenizer as a fallback for any unlisted language.
        "tokenizer": "icu",
    },
}

# --- Option A: Using Multi-Language Analyzer ---
print("\n--- Demonstrating Multi-Language Analyzer ---")

# 3A. Create a collection with the Multi Analyzer
# The 'by_field' key names the scalar field whose value picks the analyzer.
multi_analyzer_params = {
    "by_field": "language",
    "analyzers": analyzers,
}

schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=False,
)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
# 'language' selects the analyzer per row; nullable so rows may fall back to 'default'.
schema.add_field(field_name="language", datatype=DataType.VARCHAR, max_length=255, nullable=True)
# Apply our multi-language analyzer to the 'text' field.
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=25565, enable_analyzer=True, multi_analyzer_params=multi_analyzer_params)
schema.add_field(field_name="text_sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)  # BM25 sparse vector

# Add the BM25 function that derives the sparse vector from the text field.
text_bm25_function = Function(
    name="text_bm25",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names=["text_sparse"],
)
schema.add_function(text_bm25_function)

index_params = client.prepare_index_params()
index_params.add_index(
    field_name="text_sparse",
    index_type="AUTOINDEX",  # Use auto index for BM25
    metric_type="BM25",
)

client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params,
)
print(f"Collection '{COLLECTION_NAME}' created successfully.")

# 4A. Insert data for Multi-Language Analyzer and load collection.
# Insert English and Japanese movie titles, explicitly setting the 'language' field.
client.insert(
    collection_name=COLLECTION_NAME,
    data=[
        {"text": "The Lord of the Rings", "language": "English"},
        {"text": "Spirited Away", "language": "English"},
        {"text": "千と千尋の神隠し", "language": "Japanese"},  # This is "Spirited Away" in Japanese
        {"text": "君の名は。", "language": "Japanese"},  # This is "Your Name." in Japanese
    ],
)
print(f"Inserted multilingual data into '{COLLECTION_NAME}'.")

# Load the collection into memory before searching.
client.load_collection(collection_name=COLLECTION_NAME)

# 5A. Perform a full-text search with the Multi-Language Analyzer.
# When searching, explicitly specify the analyzer to use for the query string.
print("\n--- Search results for Multi-Language Analyzer ---")
results_multi_jp = client.search(
    collection_name=COLLECTION_NAME,
    data=["神隠し"],
    limit=2,
    output_fields=["text"],
    search_params={"metric_type": "BM25", "analyzer_name": "Japanese"},  # Specify Japanese analyzer for query
    consistency_level="Strong",
)
print("\nSearch results for '神隠し' (Multi-Language Analyzer):")
for result in results_multi_jp[0]:
    print(result)

results_multi_en = client.search(
    collection_name=COLLECTION_NAME,
    data=["Rings"],
    limit=2,
    output_fields=["text"],
    search_params={"metric_type": "BM25", "analyzer_name": "English"},  # Specify English analyzer for query
    consistency_level="Strong",
)
print("\nSearch results for 'Rings' (Multi-Language Analyzer):")
for result in results_multi_en[0]:
    print(result)

# Clean up the example collection.
client.drop_collection(collection_name=COLLECTION_NAME)
print(f"Collection '{COLLECTION_NAME}' dropped.")

0 commit comments

Comments
 (0)