Нужна помощь по очистке и предварительной обработке данных в моем проекте

Нужна помощь по очистке и предварительной обработке данных в моем проекте ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Нужна помощь по очистке и предварительной обработке данных в моем проекте

Цитата

Сообщение Anonymous » 12 дек 2025, 23:37

import pandas as pd
import numpy as np
import seaborn as sns

Код: Выделить всё

def introduce_missing_values(frame, columns, fraction):
    """Return a copy of *frame* with random entries of *columns* set to NaN.

    For every listed column an independent mask is drawn with
    ``np.random.rand``; rows whose draw falls below *fraction* become missing.
    The input frame itself is left untouched.

    Args:
        frame: DataFrame to corrupt (it is copied, never modified).
        columns: Labels of the columns that receive missing values.
        fraction: Expected share of rows to blank per column, within [0, 1].

    Returns:
        A new DataFrame with NaN injected into the requested columns.

    Raises:
        ValueError: If *fraction* lies outside the [0, 1] interval.
    """
    if not 0.0 <= fraction <= 1.0:
        raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")

    corrupted = frame.copy()
    for col in columns:
        # True marks the rows of this column that will be turned into NaN.
        hit = np.random.rand(len(corrupted)) < fraction
        # Series.mask lets pandas upcast integer columns (e.g. to float)
        # instead of writing NaN into them through .loc, which recent pandas
        # versions flag as an incompatible-dtype assignment.
        corrupted[col] = corrupted[col].mask(hit)
    return corrupted


if __name__ == "__main__":
    df = pd.read_excel('Online_Retail.xlsx')

    # Alternative method to introduce missing values: numpy.random.rand.
    # Columns where missing values will be introduced.
    columns_to_corrupt = ['Quantity', 'UnitPrice', 'CustomerID']

    # Share of missing values to introduce per column (20%).
    missing_percentage = 0.20

    # Work on a copy so that the original 'df' stays unmodified.
    df_10 = introduce_missing_values(df, columns_to_corrupt, missing_percentage)

    print("DataFrame with missing values introduced using np.random.rand:")
    print(df_10.head())

    print("\nNumber of missing values per column in df_10:")
    print(df_10[columns_to_corrupt].isnull().sum())

Код: Выделить всё

# Normalise 'InvoiceDate' in df_10: parse the values as datetimes, then keep
# only the calendar date rendered as a 'YYYY-MM-DD' string (the column ends up
# holding strings, not datetimes).
df_10['InvoiceDate'] = pd.to_datetime(df_10['InvoiceDate']).dt.strftime('%Y-%m-%d')

Код: Выделить всё

# Encode the 'Country' column as integers with an explicit loop and a dict.
# Codes start at 1 and are handed out in order of first appearance.
country_mapping = {}   # country name -> integer code
country_codes = []     # one code per row, in row order
current_code = 1       # next unused code

for country in df_10['Country']:
    if country not in country_mapping:
        # First time this country is seen: give it the next free code.
        country_mapping[country] = current_code
        current_code += 1
    country_codes.append(country_mapping[country])

# Add the encoded values as a new column.
df_10['Country_Code_ForLoop'] = country_codes

print("\nUsing For Loop:")
print(df_10)
print(f"\nMapping Dictionary: {country_mapping}")

Код: Выделить всё

# StockCode -> integer value.
# df_10 is expected to exist already (built by the earlier snippets), so the
# Excel file is not read again here.

# 1) Convert every StockCode to a string with an explicit for loop, so that
#    all codes share one type before they are used as dictionary keys.
new_list = []
for x in df_10["StockCode"]:
    new_list.append(str(x))

df_10["StockCode"] = new_list

# 2) Manual label encoding (no built-in encoders): ids start at 1 and are
#    assigned in order of first appearance.
stock_map = {}       # stores StockCode -> number
encoded_list = []    # stores encoded values, one per row
next_id = 1

for code in df_10["StockCode"]:
    if code not in stock_map:
        # Unseen code: reserve the next free id for it.
        stock_map[code] = next_id
        next_id += 1
    encoded_list.append(stock_map[code])

# 3) Add the encoded column.
df_10["StockCode_encoded"] = encoded_list

Код: Выделить всё

# Write the prepared frame to 'dff.csv' without the index column.
df_10.to_csv(path_or_buf='dff.csv', index=False)

Код: Выделить всё

# Write df_10 to 'dff.csv' without the index column.
# NOTE(review): the same export statement already appears in the previous
# snippet; this one writes the same file again.
df_10.to_csv(path_or_buf='dff.csv', index=False)

Постановка задачи:
· Обработайте пропущенные значения, используя подходящие стратегии заполнения пропусков (среднее, медиана, мода или более продвинутые методы).
· Выявите и обработайте повторяющиеся или ошибочные транзакции (например, отмены, отмеченные номерами счетов-фактур, начинающимися с буквы «c»).
Ниже для справки приведены изображения моего набора данных и постановки задачи:

введите сюда описание изображения

введите здесь описание изображения
Буду очень благодарен, если кто-нибудь мне поможет.

Подробнее здесь: https://stackoverflow.com/questions/79845229/need-help-for-data-cleaning-and-preprocessing-in-my-project

1765571835

Anonymous

[code]import pandas as pd
import numpy as np
import seaborn as sns
[/code]
[code]df=pd.read_excel('Online_Retail.xlsx')  # read the source workbook into df
# Alternative method to introduce missing values: numpy.random.rand.
def introduce_missing_values(frame, columns, fraction):
    """Return a copy of *frame* with random entries of *columns* set to NaN.

    For every listed column an independent mask is drawn with
    ``np.random.rand``; rows whose draw falls below *fraction* become missing.
    The input frame itself is left untouched.

    Args:
        frame: DataFrame to corrupt (it is copied, never modified).
        columns: Labels of the columns that receive missing values.
        fraction: Expected share of rows to blank per column, within [0, 1].

    Returns:
        A new DataFrame with NaN injected into the requested columns.

    Raises:
        ValueError: If *fraction* lies outside the [0, 1] interval.
    """
    if not 0.0 <= fraction <= 1.0:
        raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")

    corrupted = frame.copy()
    for col in columns:
        # True marks the rows of this column that will be turned into NaN.
        hit = np.random.rand(len(corrupted)) < fraction
        # Series.mask lets pandas upcast integer columns (e.g. to float)
        # instead of writing NaN into them through .loc, which recent pandas
        # versions flag as an incompatible-dtype assignment.
        corrupted[col] = corrupted[col].mask(hit)
    return corrupted


# Columns where missing values will be introduced.
columns_to_corrupt = ['Quantity', 'UnitPrice', 'CustomerID']

# Share of missing values to introduce per column (20%).
missing_percentage = 0.20

# Work on a copy so that the original 'df' stays unmodified.
df_10 = introduce_missing_values(df, columns_to_corrupt, missing_percentage)

print("DataFrame with missing values introduced using np.random.rand:")
print(df_10.head())

print("\nNumber of missing values per column in df_10:")
print(df_10[columns_to_corrupt].isnull().sum())
[/code]
[code]# The dataframe being used in this cell is 'df_10'
# Parse 'InvoiceDate' as datetimes and keep only the calendar date rendered as
# a 'YYYY-MM-DD' string (the column ends up holding strings, not datetimes).
df_10['InvoiceDate'] = pd.to_datetime(df_10['InvoiceDate']).dt.strftime('%Y-%m-%d')
[/code]
[code]# Encode 'Country' column to numerical format using a loop for mapping

# Method 1: for loop with dictionary mapping, applied to the 'Country' column.
# Codes start at 1 and are handed out in order of first appearance.
country_mapping = {}   # country name -> integer code
country_codes = []     # one code per row, in row order
current_code = 1       # next unused code

for country in df_10['Country']:
    if country not in country_mapping:
        # First time this country is seen: give it the next free code.
        country_mapping[country] = current_code
        current_code += 1
    country_codes.append(country_mapping[country])

# Add the encoded values as a new column.
df_10['Country_Code_ForLoop'] = country_codes

print("\nUsing For Loop:")
print(df_10)
print(f"\nMapping Dictionary: {country_mapping}")
[/code]
[code]###### stockcode to integer value
# df_10 is expected to exist already (built by the earlier snippets), so the
# Excel file is not read again here.

# 1) Convert every StockCode to a string with an explicit for loop, so that
#    all codes share one type before they are used as dictionary keys.
new_list = []
for x in df_10["StockCode"]:
    new_list.append(str(x))

df_10["StockCode"] = new_list

# 2) Manual label encoding (no built-in encoders): ids start at 1 and are
#    assigned in order of first appearance.
stock_map = {}       # stores StockCode -> number
encoded_list = []    # stores encoded values, one per row
next_id = 1

for code in df_10["StockCode"]:
    if code not in stock_map:
        # Unseen code: reserve the next free id for it.
        stock_map[code] = next_id
        next_id += 1
    encoded_list.append(stock_map[code])

# 3) Add the encoded column.
df_10["StockCode_encoded"] = encoded_list
[/code]
[code]df_10.to_csv('dff.csv', index=False)  # write df_10 to dff.csv without the index column
[/code]
[code]df_10.to_csv('dff.csv', index=False)  # NOTE(review): same export statement as in the previous snippet
[/code]
[list]
[*]Постановка задачи:
· Обработайте пропущенные значения, используя подходящие стратегии заполнения пропусков (среднее, медиана, мода или более продвинутые методы).
· Выявите и обработайте повторяющиеся или ошибочные транзакции (например, отмены, отмеченные номерами счетов-фактур, начинающимися с буквы «c»).
Ниже для справки приведены изображения моего набора данных и постановки задачи:

введите сюда описание изображения

введите здесь описание изображения
Буду очень благодарен, если кто-нибудь мне поможет.

[/list] 

Подробнее здесь: [url]https://stackoverflow.com/questions/79845229/need-help-for-data-cleaning-and-preprocessing-in-my-project[/url]