Цель кода — сравнить данные двух CSV-файлов. Когда он обнаруживает, что данные в XYZ.csv отличаются от данных в ABC.csv, он генерирует CSV-файлы обновлений, которые мы можем загрузить в систему.
Код работает нормально, когда я использую 4–5 тестовых записей. Однако обычно мы работаем с более чем 20 000 записями. Когда я пытаюсь запустить код с использованием этих наборов данных, он работает непрерывно, не выдавая ни ошибок, ни результатов.
Я даже оставил его включенным более часа, но все равно ничего. Никакой ошибки. Нет результата.
Я пробовал использовать Modin для ускорения обработки, но он тоже работает непрерывно, не выдавая никаких результатов или ошибок.
Мои вопросы:
- Это нормально? Неужели мой набор данных настолько велик, что для его работы требуются часы?
- Нужна ли оптимизация моего кода или другая библиотека, которая может работать быстрее?
- Данные между двумя CSV-файлами совершенно асимметричны. В файле XYZ.csv гораздо больше записей, поскольку он содержит учетные записи, которые не подлежат обновлению (у нас нет возможности легко исключить те, которые нам не нужны). Может быть, что-то в данных вызывает блокировку?
import concurrent.futures
from fuzzywuzzy import fuzz
def fuzzy_match(s1, s2, threshold=85):
return fuzz.ratio(s1, s2) >= threshold
def match_rows(index, xyz_row):
match = abc[abc["Emp No"] == xyz_row["Employee"]]
abc_row = None
if not match.empty:
abc_row = match.iloc[0]
else:
# Fuzzy matching using First Name, Last Name, and Email
fuzzy_matched_rows = abc.apply(lambda x: fuzzy_match(str(xyz_row["First Name"]) + " " + str(xyz_row["Last Name"]), str(x["First Name"]) + " " + str(x["Last Name"])) and (pd.isna(xyz_row["Email"]) or fuzzy_match(str(xyz_row["Email"]), str(x["Email"]))), axis=1)
abc_fuzzy_matched = abc[fuzzy_matched_rows]
if not abc_fuzzy_matched.empty:
abc_row = abc_fuzzy_matched.iloc[0]
return xyz_row, abc_row
abc = pd.read_csv("abc.csv", low_memory=False)
xyz = pd.read_csv("xyz.csv", low_memory=False)
def is_data_different(xyz_row, abc_row):
if xyz_row["First Name"] != abc_row["First Name"]:
return True
if xyz_row["Last Name"] != abc_row["Last Name"]:
return True
if xyz_row["Employee"] != abc_row["Emp No"]:
return True
if xyz_row["User's Organisation Name(s)"] != abc_row["Org L13"]:
return True
if xyz_row["User's Job Title(s)"] != abc_row["Poslevel2"]:
return True
if xyz_row["Assignment Status"] != abc_row["Assignment Status"]:
return True
if not pd.isna(xyz_row["Email"]) and xyz_row["Email"] != abc_row["Email"]:
return True
# Add more field comparisons as needed
return False
usermatch_columns = ["idnumber", "timemodified", "username", "deleted", "firstname", "lastname", "email", "town", "auth", "customfield_hierarchypath", "customfield_orgframework", "customfield_occcode", "customfield_posno", "customfield_title", "customfield_middlename", "customfield_fte", "customfield_employee", "customfield_payscale", "customfield_assignmentstartdate", "customfield_areaofwork", "customfield_maidenname", "customfield_assignmentstatus", "customfield_ProfessionalNumber", "customfield_TrainingGrade", "customfield_PatientHandler", "customfield_WorkswithChildren", "customfield_Clinical", "customfield_Frontline"]
jobassign_columns = ["idnumber", "useridenumber", "timemodified", "deleted", "fullname", "orgidnumber", "posidnumber", 'manageridnumber']
usermatch = pd.DataFrame(columns=usermatch_columns)
jobassign = pd.DataFrame(columns=jobassign_columns)
manager_username = None
# Add userupload_columns and userupload DataFrame
userupload_columns = ["username", "firstname", "lastname", "email", "auth", "idnumber"]
userupload = pd.DataFrame(columns=userupload_columns)
manager_idnumber_dict = dict(zip(abc["Emp No"], abc["Manager Employee Number"]))
with concurrent.futures.ThreadPoolExecutor() as executor:
matched_rows = list(executor.map(match_rows, *zip(*xyz.iterrows())))
for xyz_row, abc_row in matched_rows:
if abc_row is not None:
# Check if User ID Number is empty, and use the capitalized version of the username if it is
if pd.isna(xyz_row["User ID Number"]) or xyz_row["User ID Number"] == "":
xyz_row["User ID Number"] = xyz_row["Username"].title()
manager_id = abc_row["Manager Employee Number"]
manager_username = xyz.loc[xyz["Employee"] == manager_id, "Username"]
manager_username = manager_username.iloc[0] if not manager_username.empty else None
if is_data_different(xyz_row, abc_row):
# Populate usermatch.csv
usermatch_row = {
"idnumber": xyz_row["User ID Number"],
"timemodified": 0,
"username": xyz_row["Username"],
"deleted": 0,
"firstname": abc_row["First Name"],
"lastname": abc_row["Last Name"],
"email": xyz_row["Email"],
"town": abc_row["City"],
"auth": "manual",
"customfield_hierarchypath": f'{abc_row["Org L3"]}~{abc_row["Org Level 3"]}~{abc_row["Org L6"]}~{abc_row["Org L13"]}',
"customfield_orgframework": "UHSussex",
"customfield_occcode": abc_row["Occ Code"],
"customfield_posno": abc_row["Position No"],
"customfield_title": abc_row["Title"],
"customfield_middlename": abc_row["Middle Name"],
"customfield_fte": abc_row["FTE"],
"customfield_employee": abc_row["Emp No"],
"customfield_payscale": abc_row["Pay Scale"],
"customfield_assignmentstartdate": abc_row["Assign Start Date"],
"customfield_areaofwork": abc_row["Area Of Work"],
"customfield_maidenname": abc_row["Maiden Name"],
"customfield_assignmentstatus": abc_row["Assignment Status"],
"customfield_ProfessionalNumber": abc_row["Professional Registration Num"],
"customfield_TrainingGrade": "",
"customfield_PatientHandler": 0,
"customfield_WorkswithChildren": 0,
"customfield_Clinical": 0,
"customfield_Frontline": 0
}
usermatch = pd.concat([usermatch, pd.DataFrame([usermatch_row], columns=usermatch_columns)], ignore_index=True)
# Populate jobassign.csv
jobassign_row = {
"idnumber": abc_row["Emp No"],
"useridenumber": xyz_row["User ID Number"],
"timemodified": 0,
"deleted": 0,
"fullname": abc_row["Position Title"],
"orgidnumber": abc_row["Org L13"],
"posidnumber": abc_row["Poslevel2"],
"manageridnumber": manager_username
}
jobassign = pd.concat([jobassign, pd.DataFrame([jobassign_row], columns=jobassign_columns)], ignore_index=True)
# Populate userupload.csv
userupload_row = {
"username": xyz_row["Username"],
"firstname": abc_row["First Name"],
"lastname": abc_row["Last Name"],
"email": xyz_row["Email"],
"auth": "manual",
"idnumber": xyz_row["User ID Number"],
}
userupload = pd.concat([userupload, pd.DataFrame([userupload_row], columns=userupload_columns)], ignore_index=True)
def has_fuzzy_match(abc_row):
for _, xyz_row in xyz.iterrows():
if fuzzy_match(abc_row["First Name"] + " " + abc_row["Last Name"], xyz_row["First Name"] + " " + xyz_row["Last Name"]) and (pd.isna(abc_row["Email"]) or fuzzy_match(abc_row["Email"], xyz_row["Email"])):
return True
return False
# Step 1: Create a new DataFrame called newusers with the same columns as userupload.csv
newusers_columns = ["username", "firstname", "lastname", "email", "auth", "idnumber"]
newusers = pd.DataFrame(columns=newusers_columns)
# Step 2: Iterate through the rows of abc.csv and check if each row exists in xyz.csv
for _, abc_row in abc.iterrows():
if abc_row["Emp No"] not in xyz["Employee"].values and not has_fuzzy_match(abc_row):
# Step 3: Generate a username from the email address and populate the newusers.csv file
username = abc_row["Email"].split('@')[0] if pd.notna(abc_row["Email"]) else ""
newusers_row = {
"username": username,
"firstname": abc_row["First Name"],
"lastname": abc_row["Last Name"],
"email": abc_row["Email"],
"auth": "manual",
"idnumber": username.title(),
}
newusers = pd.concat([newusers, pd.DataFrame([newusers_row], columns=userupload_columns)], ignore_index=True)
# Add new user data to usermatch and jobassign DataFrames
manager_id = abc_row["Manager Employee Number"]
manager_username = xyz.loc[xyz["Employee"] == manager_id, "Username"]
manager_username = manager_username.iloc[0] if not manager_username.empty else None
usermatch_row = {
"idnumber": username.title(),
"timemodified": 0,
"username": xyz_row["Username"],
"deleted": 0,
"firstname": abc_row["First Name"],
"lastname": abc_row["Last Name"],
"email": xyz_row["Email"],
"town": abc_row["City"],
"auth": "manual",
"customfield_hierarchypath": f'{abc_row["Org L3"]}~{abc_row["Org Level 3"]}~{abc_row["Org L6"]}~{abc_row["Org L13"]}',
"customfield_orgframework": "UHSussex",
"customfield_occcode": abc_row["Occ Code"],
"customfield_posno": abc_row["Position No"],
"customfield_title": abc_row["Title"],
"customfield_middlename": abc_row["Middle Name"],
"customfield_fte": abc_row["FTE"],
"customfield_employee": abc_row["Emp No"],
"customfield_payscale": abc_row["Pay Scale"],
"customfield_assignmentstartdate": abc_row["Assign Start Date"],
"customfield_areaofwork": abc_row["Area Of Work"],
"customfield_maidenname": abc_row["Maiden Name"],
"customfield_assignmentstatus": abc_row["Assignment Status"],
"customfield_ProfessionalNumber": abc_row["Professional Registration Num"],
"customfield_TrainingGrade": "",
"customfield_PatientHandler": 0,
"customfield_WorkswithChildren": 0,
"customfield_Clinical": 0,
"customfield_Frontline": 0
}
usermatch = pd.concat([usermatch, pd.DataFrame([usermatch_row], columns=usermatch_columns)], ignore_index=True)
jobassign_row = {
"idnumber": abc_row["Emp No"],
"useridenumber": username.title(),
"timemodified": 0,
"deleted": 0,
"fullname": abc_row["Position Title"],
"orgidnumber": abc_row["Org L13"],
"posidnumber": abc_row["Poslevel2"],
# other fields from the original code
"manageridnumber": manager_username
}
jobassign = pd.concat([jobassign, pd.DataFrame([jobassign_row], columns=jobassign_columns)], ignore_index=True)
# Save the output CSVs
usermatch.to_csv("usermatch.csv", index=False)
jobassign.to_csv("jobassign.csv", index=False)
userupload.to_csv("userupload.csv", index=False)
newusers.to_csv("newusers.csv", index=False)
Подробнее здесь: https://stackoverflow.com/questions/761 ... e-datasets
Мобильная версия