AllCorrect DA Project
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/data/df.csv', index_col=0)
df.columns = ['id', 'score', 'text']
df.shape
df.head()
df = df.dropna()
df = df.drop_duplicates().reset_index(drop=True)
keywords = pd.read_excel('/content/drive/My Drive/data/Keywords.xlsx')
keywords
keywords = keywords.drop(29, axis=0)
keywords['Language combination'] = keywords['Language combination'].str.replace('English - ', '')
keywords_list = (
    keywords['"Translation"'].tolist()
    + keywords['"Language"'].tolist()
    + keywords['"Localization"'].tolist()
    + keywords['"English"'].tolist()
    + keywords['Language'].tolist()
    + keywords['Language combination'].tolist()
)
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer = PorterStemmer()
stemmer_rus = SnowballStemmer('russian')
keywords_stem = [stemmer.stem(word) for word in keywords_list]
keywords_stem = keywords_stem + ['翻译', '语言', '中文', '中国', '汉语', '本土化', '英语', '版本',
'区域', '跨地区', '地区', '华语', '汉化', '英文', '国家', '国服', 'VPN', 'русс', 'англ']
keywords_stem = [word.replace(' ', '') for word in keywords_stem]
keywords_string = '|'.join(keywords_stem)
df['is_about_localization'] = df['text'].str.lower().str.contains(keywords_string.lower())
df.head()
df_loc = df[df['is_about_localization']].copy()
df_loc.shape
df_loc
import re

def clear_text(text):
    # keep only latin letters and apostrophes, collapse whitespace
    new_text = re.sub(r"[^a-zA-Z']", " ", text)
    new_text = " ".join(new_text.split())
    return new_text.lower()
import langdetect

def determine_language(text):
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        return None
df_loc['language'] = df_loc['text'].apply(determine_language)
df_loc['language'].unique()
df_loc[df_loc['language'].isnull()]
df_loc['language'] = df_loc['language'].fillna('en')
df_loc.groupby('language')['language'].count().sort_values()
df_loc['language'] = df_loc['language'].str.replace('zh-tw', 'zh')
df_loc['language'] = df_loc['language'].str.replace('zh-cn', 'zh')
table_lang_and_codes = keywords[['Language combination', 'ISO 639-1']].copy()
table_lang_and_codes['ISO 639-1'] = table_lang_and_codes['ISO 639-1'].str.lower()
table_lang_and_codes.columns = ['language_full', 'language']
# keep only the first row per language code (duplicates become NaN and are dropped below)
table_lang_and_codes['language'] = table_lang_and_codes['language'].drop_duplicates()
table_lang_and_codes = table_lang_and_codes.dropna()
table_lang_and_codes['language_full'] = table_lang_and_codes['language_full'].str.replace('Chinese Simplified', 'Chinese')
table_lang_and_codes['language_full'] = table_lang_and_codes['language_full'].where(table_lang_and_codes['language_full'] != 'Portuguese (BRZ)', 'Portuguese')
table_lang_and_codes
df_loc_new = df_loc.merge(table_lang_and_codes, on='language', how='left')
df_loc_new.head()
df_loc_new[df_loc_new['language_full'].isnull()].groupby('language')['language'].count()
There are some reviews in languages that we don't have in the keyword lists, but they contain words shared with the languages we do have. We can translate them to English to process them further.
df_loc_known = df_loc_new.dropna()
df_loc_known.shape
# The translation was run once and the result cached to CSV (loaded in the
# next cell); GoogleTranslator here is presumably deep_translator's:
# from deep_translator import GoogleTranslator
# translator = GoogleTranslator(source='auto', target='en')
# def translate_to_en(text):
#     try:
#         return translator.translate(text)
#     except Exception:
#         return None
# df_loc_known['en_trans'] = df_loc_known['text'].apply(translate_to_en)
df_loc_known = pd.read_csv('/content/drive/My Drive/data/df_loc_known.csv', index_col=0)
df_loc_known[df_loc_known['en_trans'].isnull()]['language'].count()
Only 343 reviews remain without a translation. They can be used to test our future model.
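As a minimal sketch, these untranslated reviews could be set aside as a hold-out set (df_loc_test is a hypothetical name and is not used below):
# Hypothetical hold-out set of the untranslated reviews
# (df_loc_test is an illustrative name, not reused later).
df_loc_test = df_loc_known[df_loc_known['en_trans'].isnull()].copy()
df_loc_test.shape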
df_final = df_loc_known[df_loc_known['en_trans'].notnull()].copy()
import nltk

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
df_final['sentiment_vader'] = df_final['en_trans'].apply(lambda text: sid.polarity_scores(text))
for i in range(20, 30):
    print('Row ', i)
    print(df_final.loc[i, 'en_trans'])
    print(df_final.loc[i, 'sentiment_vader']['compound'])
    print('')
# reuse the scores computed above instead of running VADER a second time
df_final['compound'] = df_final['sentiment_vader'].apply(lambda scores: scores['compound'])
df_final['score_sentiment'] = pd.cut(df_final['compound'], 5, labels=[1, 2, 3, 4, 5]).astype(int)
The compound value is a single normalized score between -1 and 1 that balances the negative and positive components of the sentiment. That is why it is used for the further analysis.
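As a quick illustration (the sample sentences are made up), polarity_scores returns the component scores alongside the compound value:
# Made-up sample sentences: polarity_scores returns a dict with 'neg',
# 'neu', 'pos' (proportions) and 'compound' (normalized to [-1, 1]).
for sample in ["The localization is great, thank you!",
               "No translation, the menus are broken, awful."]:
    print(sample, '->', sid.polarity_scores(sample))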
df_final['is_pos'] = (df_final['compound'] >= 0).astype(int)
# games that received at least one review with score 0
games_with_zero = df_final[df_final['score'] < 1]['id'].unique().tolist()
for i in games_with_zero:
    print(i, 'Scores:', df_final[df_final['id'] == i]['score'].unique(),
          'Number of reviews:', df_final[df_final['id'] == i]['id'].count())
    print('')
Every game that has a score of 0 also has a score of 1. There was probably some kind of "do you like / dislike our game?" question in those games, which is why there are 22 games with only those two scores.
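As a quick check, this observation can be verified directly:
# Sanity check: every game that received a 0 should also have
# at least one review with score 1.
all(1 in df_final[df_final['id'] == g]['score'].values for g in games_with_zero)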
avrg_score = pd.DataFrame(df_final.groupby('id')['score'].mean())
avrg_score_sentiment = pd.DataFrame(df_final.groupby('id')['score_sentiment'].mean())
avrg_compound_sentiment = pd.DataFrame(df_final.groupby('id')['is_pos'].mean())
scores_all = avrg_score[
    (~avrg_score.index.isin(games_with_zero))
    & (df_final.groupby('id')['score'].count() > 1)
].merge(avrg_score_sentiment, on='id')
scores_all = scores_all.round(3)
scores_all.sort_values(by='score_sentiment').head()
compound_all = avrg_score[avrg_score.index.isin(games_with_zero)].merge(avrg_compound_sentiment, on='id')
compound_all = compound_all.round(3)
compound_all.sort_values(by='is_pos').head()
scores_all[:50].plot.bar(figsize=(30, 10))
review_count = df_final[~df_final['id'].isin(games_with_zero)].groupby('id')['text'].count().reset_index()
review_count.columns = ['id', 'text_count']
review_count.head()
scores_all = scores_all.merge(review_count, on='id').sort_values(by=['score_sentiment'], ascending=True)
scores_all[:10]
df_final.head()
neg_review_count = df_final[df_final['is_pos']==0].groupby('id')['text'].count().reset_index().sort_values(by='text', ascending=False)
neg_review_count.head(10)
These are the 10 games with the highest count of negative localization reviews. Let's see which languages were requested most often.
Usually the primary language of a new game is English, so I assume that people who wrote their reviews in English were not asking for a translation. That is of course not strictly true, but it gives a fast overview.
df_to_dashboard_negative_top = df_final[
    (df_final['id'].isin(neg_review_count['id'].tolist()[:10]))
    & (df_final['is_pos'] == 0)
    & (df_final['language_full'] != 'English')
][['id', 'language_full', 'score_sentiment', 'is_pos']]
for i in neg_review_count['id'].tolist()[:10]:
    # per-game breakdown of negative, non-English reviews by language
    df_game = (df_final[(df_final['id'] == i) & (df_final['is_pos'] == 0)
                        & (df_final['language_full'] != 'English')]
               .groupby('language_full')['text'].count()
               .reset_index().sort_values(by='text', ascending=False))
    print("game:", i)
    print(df_game.head(10))
    print('')
    top_lang = df_game['language_full'][:4]
    df_to_dashboard_negative_top['language_full'] = np.where(
        ~(df_to_dashboard_negative_top['language_full'].isin(top_lang))
        & (df_to_dashboard_negative_top['id'] == i),
        'Other',
        df_to_dashboard_negative_top['language_full'])
popular_games_without_loc = scores_all.sort_values(by=['text_count'], ascending=False)['id'][:10].tolist()
popular_games_without_loc
df_final[df_final['id'].isin(popular_games_without_loc)].groupby('language_full')['language_full'].count().sort_values(ascending=False)
df_final.groupby('language_full')['language_full'].count().sort_values(ascending=False)
Conclusion
We do not really know the reasons for the negative reviews. VADER relies on the emotional weight of words, so a review is marked negative simply because the reviewer used emotionally negative words.
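For illustration (a made-up sentence): a complaint phrased without emotionally charged words typically scores close to neutral, even though it clearly reports a localization problem:
# Made-up example: likely a near-neutral compound despite reporting a problem.
print(sid.polarity_scores("There is no Turkish translation in this game."))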
We do know the sentiment of all reviews, so we can assume that a negative review points to a problem with the language the review was written in.
The languages in which most localization reviews are written are English, Turkish, Portuguese, Thai, Russian, Chinese, Korean, French, German, Italian, and Indonesian.