Найдено наилучшее совпадение [закрыто] - Цифровое Кемерово

Найдено наилучшее совпадение [закрыто] ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Цитата

Сообщение Anonymous » 09 янв 2025, 02:07

Мне нужно решить следующую задачу: у меня есть база данных из 10 миллионов записей, в основном с именами людей, и по имени нужно определять пол. В записях встречаются разные проблемы: орфографические ошибки, имена без пробелов, пробелы между буквами и бессмысленные слова, например: («XYZTU», «K A REN», «ГУАДОЛПЕ», «АРМАНДОЕДУАРДО», «КЕНИАМАРИЯ», «РО ДРИ Г О», «ФРЭНСИС КО»). Задача состоит в том, чтобы определить пол на основе правил. Кроме того, у меня есть справочная база имён, разделённых по полу. Я использовал такие инструменты, как rapidfuzz и faiss. Мой код:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rapidfuzz import process, fuzz
import faiss

def find_candidates_with_faiss(query, top_k=5):
    """
    Find up to `top_k` nearest reference names for `query` using Faiss.

    Relies on three module-level objects that are defined outside this
    function: `vectorizer` (anything with a `transform` method returning a
    sparse matrix -- presumably a fitted TfidfVectorizer, TODO confirm),
    `faiss_index` (the index being searched) and `all_names` (the sequence
    of reference names, indexed by the ids the index returns).

    Args:
        query: The string to look up.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of `(name, distance)` tuples in the order the index returned
        them. It may be shorter than `top_k`.
    """
    query_embedding = vectorizer.transform([query]).toarray().astype(np.float32)
    distances, indices = faiss_index.search(query_embedding, top_k)
    # Faiss pads the result with label -1 when it finds fewer than `top_k`
    # neighbours; indexing `all_names[-1]` would silently return the last
    # reference name, so those slots are skipped.
    return [
        (all_names[idx], distance)
        for idx, distance in zip(indices[0], distances[0])
        if idx >= 0
    ]

def is_valid_name(query, match, *, max_query_len=30, min_match_len=4,
                  min_length_ratio=0.6):
    """
    Check whether `match` is an acceptable reference name for `query`.

    The check is purely length-based; the characters of the two strings are
    not compared here.

    Args:
        query: The input string currently being matched.
        match: The candidate reference name.
        max_query_len: Queries longer than this are rejected.
        min_match_len: Candidates shorter than this are rejected.
        min_length_ratio: Smallest accepted value of
            `len(query) / len(match)`.

    Returns:
        True when every length rule passes, False otherwise.
    """
    # Unusually long strings are not treated as names.
    if len(query) > max_query_len:
        return False
    # Very short reference names match too easily.
    if len(match) < min_match_len:
        return False
    # Only reachable when the caller lowers `min_match_len` to 0; avoids a
    # division by zero below.
    if not match:
        return False

    # The query must be at least `min_length_ratio` times as long as the
    # candidate, i.e. candidates much longer than the text are rejected.
    # A candidate shorter than the query always passes this rule.
    return len(query) / len(match) >= min_length_ratio

def segment_and_match(query):
    """
    Greedily peel reference names off `query`, one accepted match per pass.

    Each pass asks `find_candidates_with_faiss` for 5 candidates, scores
    them against the remaining text with `fuzz.WRatio`, and keeps the
    highest-scoring one. The loop stops as soon as there are no candidates,
    the best score is below 90, `is_valid_name` rejects the candidate, or
    the accepted name is not a literal substring of the remaining text.

    Returns:
        A list of `(name, score, gender)` tuples, where `gender` comes from
        the module-level `gender_mapping` ("Unknown" when the name is not a
        key of it).
    """
    found = []
    text = query

    while text:
        # Score every Faiss candidate against what is left of the query.
        scored = [
            (name, fuzz.WRatio(text, name), distance)
            for name, distance in find_candidates_with_faiss(text, top_k=5)
        ]
        if not scored:
            break  # nothing to compare against

        # Highest WRatio wins; on ties the earliest candidate is kept.
        name, score, _ = max(scored, key=lambda item: item[1])

        if score < 90:
            break
        if not is_valid_name(text, name):
            break

        # The match is recorded even if it turns out below that it is only
        # a fuzzy hit and cannot be cut out of the text.
        found.append((name, score, gender_mapping.get(name, "Unknown")))

        if name not in text:
            break  # cannot shrink the text, so stop instead of looping forever
        # Cut out the first occurrence and continue with the rest.
        text = text.replace(name, "", 1)

    return found

def determine_gender(query):
    """
    Derive a single gender label for `query` from its segment matches.

    Returns:
        A 3-tuple `(label, matches, None)`. `label` is the first of "M",
        "F", "U" that appears among the matched genders (in that priority
        order), or "Unknown" when none of them appears. `matches` is the
        list from `segment_and_match`, or None when it was empty. The third
        element is always None.
    """
    matches = segment_and_match(query)
    if not matches:
        return "Unknown", None, None

    # Third field of every match tuple is its gender label.
    genders = [entry[2] for entry in matches]

    # Fixed priority: male beats female beats unisex.
    for label in ("M", "F", "U"):
        if label in genders:
            return label, matches, None
    return "Unknown", matches, None

def process_file(input_file, output_file, num_lines=50, *,
                 unmatched_file="unmatched_names.txt", encoding="utf-8"):
    """
    Classify the first `num_lines` names of a text file and save the results.

    Every input line is stripped of leading/trailing whitespace and passed
    to `determine_gender`. Spaces inside a name are kept as they are.
    NOTE(review): the original description said spaces are dropped, but the
    code never removed inner spaces -- confirm which behaviour is wanted.

    Args:
        input_file: Path of the text file with one name per line.
        output_file: Path of the CSV file to write, with the columns
            Query, Gender, Matches, Scores. Multiple matches and scores are
            joined with "|".
        num_lines: Maximum number of input lines to process; None processes
            the whole file.
        unmatched_file: Path that receives the names classified as
            "Unknown", one per line.
        encoding: Text encoding used for all three files.
    """
    # Local import so the block does not depend on the file's import section.
    import csv

    results = []
    unmatched = []

    with open(input_file, "r", encoding=encoding) as file:
        for i, line in enumerate(file):
            if num_lines is not None and i >= num_lines:
                break
            name = line.strip()
            gender, matches, _ = determine_gender(name)

            if matches:
                match_str = "|".join(match[0] for match in matches)
                score_str = "|".join(str(match[1]) for match in matches)
            else:
                match_str, score_str = None, None

            results.append((name, gender, match_str, score_str))
            if gender == "Unknown":
                unmatched.append(name)

    # csv.writer quotes fields that contain commas or quotes, so such names
    # no longer shift the columns of the output file.
    with open(output_file, "w", encoding=encoding, newline="") as file:
        writer = csv.writer(file, lineterminator="\n")
        writer.writerow(["Query", "Gender", "Matches", "Scores"])
        for name, gender, match_str, score_str in results:
            # str() keeps the literal "None" that rows without matches have
            # always contained, so existing consumers see the same values.
            writer.writerow([name, gender, str(match_str), str(score_str)])

    # Save unmatched names for manual review.
    with open(unmatched_file, "w", encoding=encoding) as file:
        file.write("\n".join(unmatched))

Мой вопрос: как можно улучшить этот код, чтобы он правильно обрабатывал такие случаи, как бессмысленные слова, которые с высокой оценкой сходства совпадают с именем из справочника, например:
name,gender,match,score
PERICO,M,ERICO,90.9090909090909
ADADAD,M,ADAD,90.0
RAYDESEL,M,AYDE|ARSELI,90.0|90.0
AMAMAMAMAMAM,F,MAMA|MAMA|HAMAMA,90.0|90.0|90.0
JAINIK,M,JAIN,90.0
ALVIO,M,SALVIO,90.9090909090909

Подробнее здесь: https://stackoverflow.com/questions/793 ... best-match

1736377621

Anonymous

Мне нужно решить следующую задачу: у меня есть база данных из 10 миллионов записей, в основном с именами людей, и по имени нужно определять пол. В записях встречаются разные проблемы: орфографические ошибки, имена без пробелов, пробелы между буквами и бессмысленные слова, например: («XYZTU», «K A REN», «ГУАДОЛПЕ», «АРМАНДОЕДУАРДО», «КЕНИАМАРИЯ», «РО ДРИ Г О», «ФРЭНСИС КО»). Задача состоит в том, чтобы определить пол на основе правил. Кроме того, у меня есть справочная база имён, разделённых по полу. Я использовал такие инструменты, как [b]rapidfuzz и faiss[/b]. Мой код:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rapidfuzz import process, fuzz
import faiss

def find_candidates_with_faiss(query, top_k=5):
    """
    Find up to `top_k` nearest reference names for `query` using Faiss.

    Relies on three module-level objects that are defined outside this
    function: `vectorizer` (anything with a `transform` method returning a
    sparse matrix -- presumably a fitted TfidfVectorizer, TODO confirm),
    `faiss_index` (the index being searched) and `all_names` (the sequence
    of reference names, indexed by the ids the index returns).

    Args:
        query: The string to look up.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of `(name, distance)` tuples in the order the index returned
        them. It may be shorter than `top_k`.
    """
    query_embedding = vectorizer.transform([query]).toarray().astype(np.float32)
    distances, indices = faiss_index.search(query_embedding, top_k)
    # Faiss pads the result with label -1 when it finds fewer than `top_k`
    # neighbours; indexing `all_names[-1]` would silently return the last
    # reference name, so those slots are skipped.
    return [
        (all_names[idx], distance)
        for idx, distance in zip(indices[0], distances[0])
        if idx >= 0
    ]

def is_valid_name(query, match, *, max_query_len=30, min_match_len=4,
                  min_length_ratio=0.6):
    """
    Check whether `match` is an acceptable reference name for `query`.

    The check is purely length-based; the characters of the two strings are
    not compared here.

    Args:
        query: The input string currently being matched.
        match: The candidate reference name.
        max_query_len: Queries longer than this are rejected.
        min_match_len: Candidates shorter than this are rejected.
        min_length_ratio: Smallest accepted value of
            `len(query) / len(match)`.

    Returns:
        True when every length rule passes, False otherwise.
    """
    # Unusually long strings are not treated as names.
    if len(query) > max_query_len:
        return False
    # Very short reference names match too easily.
    if len(match) < min_match_len:
        return False
    # Only reachable when the caller lowers `min_match_len` to 0; avoids a
    # division by zero below.
    if not match:
        return False

    # The query must be at least `min_length_ratio` times as long as the
    # candidate, i.e. candidates much longer than the text are rejected.
    # A candidate shorter than the query always passes this rule.
    return len(query) / len(match) >= min_length_ratio

def segment_and_match(query):
    """
    Greedily peel reference names off `query`, one accepted match per pass.

    Each pass asks `find_candidates_with_faiss` for 5 candidates, scores
    them against the remaining text with `fuzz.WRatio`, and keeps the
    highest-scoring one. The loop stops as soon as there are no candidates,
    the best score is below 90, `is_valid_name` rejects the candidate, or
    the accepted name is not a literal substring of the remaining text.

    Returns:
        A list of `(name, score, gender)` tuples, where `gender` comes from
        the module-level `gender_mapping` ("Unknown" when the name is not a
        key of it).
    """
    found = []
    text = query

    while text:
        # Score every Faiss candidate against what is left of the query.
        scored = [
            (name, fuzz.WRatio(text, name), distance)
            for name, distance in find_candidates_with_faiss(text, top_k=5)
        ]
        if not scored:
            break  # nothing to compare against

        # Highest WRatio wins; on ties the earliest candidate is kept.
        name, score, _ = max(scored, key=lambda item: item[1])

        if score < 90:
            break
        if not is_valid_name(text, name):
            break

        # The match is recorded even if it turns out below that it is only
        # a fuzzy hit and cannot be cut out of the text.
        found.append((name, score, gender_mapping.get(name, "Unknown")))

        if name not in text:
            break  # cannot shrink the text, so stop instead of looping forever
        # Cut out the first occurrence and continue with the rest.
        text = text.replace(name, "", 1)

    return found

def determine_gender(query):
    """
    Derive a single gender label for `query` from its segment matches.

    Returns:
        A 3-tuple `(label, matches, None)`. `label` is the first of "M",
        "F", "U" that appears among the matched genders (in that priority
        order), or "Unknown" when none of them appears. `matches` is the
        list from `segment_and_match`, or None when it was empty. The third
        element is always None.
    """
    matches = segment_and_match(query)
    if not matches:
        return "Unknown", None, None

    # Third field of every match tuple is its gender label.
    genders = [entry[2] for entry in matches]

    # Fixed priority: male beats female beats unisex.
    for label in ("M", "F", "U"):
        if label in genders:
            return label, matches, None
    return "Unknown", matches, None

def process_file(input_file, output_file, num_lines=50, *,
                 unmatched_file="unmatched_names.txt", encoding="utf-8"):
    """
    Classify the first `num_lines` names of a text file and save the results.

    Every input line is stripped of leading/trailing whitespace and passed
    to `determine_gender`. Spaces inside a name are kept as they are.
    NOTE(review): the original description said spaces are dropped, but the
    code never removed inner spaces -- confirm which behaviour is wanted.

    Args:
        input_file: Path of the text file with one name per line.
        output_file: Path of the CSV file to write, with the columns
            Query, Gender, Matches, Scores. Multiple matches and scores are
            joined with "|".
        num_lines: Maximum number of input lines to process; None processes
            the whole file.
        unmatched_file: Path that receives the names classified as
            "Unknown", one per line.
        encoding: Text encoding used for all three files.
    """
    # Local import so the block does not depend on the file's import section.
    import csv

    results = []
    unmatched = []

    with open(input_file, "r", encoding=encoding) as file:
        for i, line in enumerate(file):
            if num_lines is not None and i >= num_lines:
                break
            name = line.strip()
            gender, matches, _ = determine_gender(name)

            if matches:
                match_str = "|".join(match[0] for match in matches)
                score_str = "|".join(str(match[1]) for match in matches)
            else:
                match_str, score_str = None, None

            results.append((name, gender, match_str, score_str))
            if gender == "Unknown":
                unmatched.append(name)

    # csv.writer quotes fields that contain commas or quotes, so such names
    # no longer shift the columns of the output file.
    with open(output_file, "w", encoding=encoding, newline="") as file:
        writer = csv.writer(file, lineterminator="\n")
        writer.writerow(["Query", "Gender", "Matches", "Scores"])
        for name, gender, match_str, score_str in results:
            # str() keeps the literal "None" that rows without matches have
            # always contained, so existing consumers see the same values.
            writer.writerow([name, gender, str(match_str), str(score_str)])

    # Save unmatched names for manual review.
    with open(unmatched_file, "w", encoding=encoding) as file:
        file.write("\n".join(unmatched))

Мой вопрос: как можно улучшить этот код, чтобы он правильно обрабатывал такие случаи, как бессмысленные слова, которые с высокой оценкой сходства совпадают с именем из справочника, например:
name,gender,match,score
PERICO,M,ERICO,90.9090909090909
ADADAD,M,ADAD,90.0
RAYDESEL,M,AYDE|ARSELI,90.0|90.0
AMAMAMAMAMAM,F,MAMA|MAMA|HAMAMA,90.0|90.0|90.0
JAINIK,M,JAIN,90.0
ALVIO,M,SALVIO,90.9090909090909
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79320528/found-the-best-match[/url]