Код: Выделить всё
df = pd.read_csv('data/cyberbullying_tweets.csv')
Первый метод:
Код: Выделить всё
import re
# Pattern for a hashtag: '#' followed by one or more ASCII letters or digits.
# It is not anchored, so it also matches inside a longer token
# (e.g. "mid#tag") and stops at the first non-alphanumeric character
# (e.g. "#tag!" still yields the match "#tag").
hashtag_pattern = r'#[A-Za-z0-9]+'


def count_total_hashtags(dataframe):
    """Return the total number of hashtag matches in ``dataframe['tweet_text']``.

    Every non-overlapping match of ``hashtag_pattern`` is counted, wherever it
    occurs in the text. Rows whose text is missing (NaN/None) are skipped
    instead of raising ``TypeError``.

    Args:
        dataframe: A pandas DataFrame with a ``tweet_text`` column.

    Returns:
        The sum of the per-tweet match counts (0 when there are no tweets).
    """
    # Drop missing values first; astype(str) keeps the .str accessor usable
    # even when the remaining column is empty or not of string dtype.
    texts = dataframe['tweet_text'].dropna().astype(str)
    # str.count gives the number of regex matches per row, which is the same
    # number as len(findall(...)) for a pattern without capture groups.
    return texts.str.count(hashtag_pattern).sum()
# Print the hashtag total separately for every distinct cyberbullying category
for category in df['cyberbullying_type'].unique():
    # Boolean mask selecting only the tweets that belong to this category
    in_category = df['cyberbullying_type'] == category
    count = count_total_hashtags(df.loc[in_category])
    print(f"Number of hashtags in all tweets for the '{category}' category: {count}")
Второй метод:
Следующий метод более ручной:
Код: Выделить всё
def count_hashtags_by_category(dataframe):
    """Count hashtag tokens per ``cyberbullying_type`` category.

    A tweet is split on whitespace, and a token counts as a hashtag only when
    it starts with '#' and everything after the '#' is alphanumeric according
    to ``str.isalnum`` (so "#tag!" and a bare "#" are not counted, and a '#'
    in the middle of a token is ignored). Tweets whose text is missing
    (NaN/None) are skipped instead of raising ``AttributeError``.

    Args:
        dataframe: A pandas DataFrame with ``cyberbullying_type`` and
            ``tweet_text`` columns.

    Returns:
        A dict mapping each distinct category value to its hashtag count.
    """

    def _count_in_text(text):
        # Number of whitespace-separated tokens in this tweet that are hashtags.
        return sum(
            1 for word in text.split()
            if word.startswith('#') and word[1:].isalnum()
        )

    hashtag_counts = {}
    for category in dataframe['cyberbullying_type'].unique():
        # Filter tweets by category
        category_tweets = dataframe[dataframe['cyberbullying_type'] == category]
        # Missing texts have no .split(); drop them before counting
        texts = category_tweets['tweet_text'].dropna().astype(str)
        # Count hashtags in each tweet and total them for the category
        hashtag_counts[category] = texts.apply(_count_in_text).sum()
    return hashtag_counts
# Build the category -> hashtag-count mapping for the whole dataset and show it
hashtags_per_category = count_hashtags_by_category(dataframe=df)
print(hashtags_per_category)
Почему ответы различаются?
Подробнее здесь: https://stackoverflow.com/questions/793 ... nsistent-r
Мобильная версия