AllCorrect DS Project
Let's import all needed libs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import torch
import transformers
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import re
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from catboost import CatBoostClassifier, Pool
import joblib
# Load the review data from an Excel workbook into a DataFrame.
# NOTE(review): `path` is not defined in the visible code — presumably set in
# an earlier cell; confirm before running this top to bottom.
df = pd.read_excel(path)
# Print the column dtypes and non-null counts.
df.info()
# Look at the first record.
df.iloc[0]
lemmatizer = WordNetLemmatizer()
# Every run of characters other than ASCII letters and apostrophes is a separator.
_NON_LETTERS = re.compile(r"[^a-zA-Z']+")


def clean_text(text):
    """Normalise one review: lower-case it, keep only letters, lemmatize.

    Args:
        text: Raw review text. Any non-string value (for example the NaN
            pandas produces for an empty Excel cell) is treated as an
            empty review instead of raising ``AttributeError``.

    Returns:
        The lemmatized tokens joined by single spaces, or ``""`` when
        nothing is left after cleaning.
    """
    # Empty cells arrive as float NaN / None, which have no .strip().
    if not isinstance(text, str):
        return ""
    # Lower-case first, then collapse everything that is not a letter or apostrophe.
    text = _NON_LETTERS.sub(' ', text.strip().lower())
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)
# Normalise every review, then record the length of the cleaned text.
df['review'] = df['review'].apply(clean_text)
df['review_len'] = df['review'].str.len()
df = df.sort_values('review_len', axis=0)
df.head()
# How many reviews became empty after cleaning?
df[df['review_len'] == 0]['review_len'].count()
# Drop the empty reviews. .copy() makes the filtered result an independent
# frame, so the column assignments below do not write into a slice of the
# previous frame (which pandas reports as SettingWithCopyWarning).
df = df[df['review_len'] != 0].copy()
# Upper-case the labels so differently-cased spellings map to one class.
df['mark'] = df['mark'].str.upper()
df['mark'].unique()
# Encode the string labels as integers for the models below.
encoder = LabelEncoder()
df['mark_num'] = encoder.fit_transform(df['mark'])
df['mark_num'].unique()
# Class balance, and the length distribution for reviews of 251-1999 characters.
df['mark'].value_counts(normalize=True).plot(kind='pie')
df[(df['review_len'] < 2000) & (df['review_len'] > 250)]['review_len'].hist(bins=100, density=True)
The data is imbalanced: one of the four classes accounts for 82% of the reviews.
Most reviews are no longer than 500 characters.
# Baseline: TF-IDF features + one-vs-rest logistic regression.
stop_words = set(stopwords.words('english'))
corpus = df['review']
X = df['review']
y = df['mark_num']
# Stratify so the 80/20 split keeps the class proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# TfidfVectorizer documents stop_words as 'english', a list or None; a set is
# rejected by scikit-learn's parameter validation, so pass a (sorted, hence
# deterministic) list built from the set.
count_tf_idf = TfidfVectorizer(stop_words=sorted(stop_words))
# Fit the vocabulary on the training part only, then reuse it for the test part.
tf_idf_train = count_tf_idf.fit_transform(X_train)
tf_idf_test = count_tf_idf.transform(X_test)
lr_model = LogisticRegression(multi_class='ovr', solver='liblinear')
lr_model.fit(tf_idf_train, y_train)
y_pred = lr_model.predict(tf_idf_test)
# Per-class report, confusion matrix, weighted F1 and accuracy on the held-out part.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))
# Second experiment: CatBoost on the cleaned review text plus the `id` column.
# NOTE(review): `id` is passed to the model as a feature — confirm that is intended.
X = df[['review', 'id']]
y = df['mark_num']
# Stratified 80/20 split with the same seed as the TF-IDF baseline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Pools that mark `review` as a text feature.
# NOTE(review): neither pool is used by the visible code below — presumably
# they fed a training cell that is no longer here.
train_pool = Pool(data=X_train,
label=y_train,
text_features=['review'])
valid_pool = Pool(data=X_test,
label=y_test,
text_features=['review'])
# This freshly constructed classifier is replaced by the loaded model two statements below.
model_CB = CatBoostClassifier(loss_function='MultiClass')
# joblib.dump(model_CB, '/content/drive/My Drive/data/model_CB.joblib')
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source.
model_CB = joblib.load('/content/drive/My Drive/data/model_CB.joblib')
# Evaluate the loaded model on the held-out part.
y_pred = model_CB.predict(X_test)
# print(model_CB.get_best_score())
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))
# Load the pretrained 'bert-base-uncased' tokenizer and encoder; both are
# read as module-level globals by BERT_process.
bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')
def BERT_process(texts, max_length=512, batch_size=100, disable_progress_bar=True):
    """Embed texts with the module-level BERT model, one vector per text.

    Args:
        texts: Iterable of strings to embed.
        max_length: Token sequences are truncated to, and zero-padded up
            to, this many ids.
        batch_size: Number of texts sent through the model per forward pass.
        disable_progress_bar: Passed to ``tqdm`` as ``disable``.

    Returns:
        A 2-D numpy array with one row per input text, holding the
        position-0 vector of the model's first output for that text.
        For empty input an array of shape ``(0, hidden_size)`` is
        returned instead of letting ``np.concatenate`` fail.
    """
    ids_list = []
    attention_mask_list = []

    # Text -> zero-padded token ids plus the matching attention mask.
    for input_text in tqdm(texts, disable=disable_progress_bar):
        ids = bert_tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
        padded = np.array(ids + [0] * (max_length - len(ids)))
        # Positions holding the padding id 0 are masked out.
        attention_mask = np.where(padded != 0, 1, 0)
        ids_list.append(padded)
        attention_mask_list.append(attention_mask)

    # Nothing to embed: np.concatenate([]) would raise ValueError.
    if not ids_list:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    # Build each matrix once; converting a list of arrays to a tensor per
    # batch is much slower than slicing one contiguous array.
    ids_matrix = np.array(ids_list, dtype=np.int64)
    mask_matrix = np.array(attention_mask_list, dtype=np.int64)

    # Use CUDA if possible.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model.to(device)
    # Inference mode only needs to be switched on once, not per batch.
    bert_model.eval()

    # Get the embeddings in batches.
    embeddings = []
    for start in tqdm(range(0, len(ids_matrix), batch_size), disable=disable_progress_bar):
        stop = start + batch_size
        ids_batch = torch.LongTensor(ids_matrix[start:stop]).to(device)
        attention_mask_batch = torch.LongTensor(mask_matrix[start:stop]).to(device)

        with torch.no_grad():
            batch_embeddings = bert_model(input_ids=ids_batch, attention_mask=attention_mask_batch)
        # First model output, position 0 of every sequence.
        embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

    return np.concatenate(embeddings)
# np.savez_compressed('/content/drive/My Drive/data/X_BERT.npz', X=X)
# Load the precomputed BERT embeddings instead of recomputing them.
# NOTE(review): assumes the rows of X are in the same order as df — confirm.
with np.load('/content/drive/My Drive/data/X_BERT.npz') as data:
    X = data['X']
y = df['mark_num']
# 70% train / 30% held out; stratified like the earlier splits because the
# classes are heavily imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Halve the held-out part into test and validation. The seed makes this split
# reproducible too — without it every run evaluates on different rows.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=.5, random_state=42)
y_train = np.asarray(y_train).astype(np.float32)
train_dataset = tf.data.Dataset.from_tensor_slices((
    X_train,
    y_train
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    X_test,
    y_test
))
# Small dense classifier on top of the BERT embeddings.
# The softmax head needs exactly one unit per class; a wider head (it was
# hard-coded to 128) lets the model predict class ids that do not exist.
num_classes = len(encoder.classes_)
model = keras.models.Sequential()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
model.add(tf.keras.layers.Dense(16, activation="relu"))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
# model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')
model.compile(
    optimizer=optimizer,  # Optimizer
    # Loss function to minimize (labels are integer class ids)
    loss=keras.losses.SparseCategoricalCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
# The dataset is already batched by .batch(16); Keras documents that
# batch_size must not be given for dataset inputs, so it is not passed here.
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, validation_data=(X_val, y_val))
# Predicted class = index of the largest softmax output.
y_pred = model.predict(X_test, verbose=True).argmax(axis=-1)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))
Conclusion
The score of the deep-learning model on BERT embeddings is lower than the accuracy of a simple LogisticRegression model.
This is probably because the data is imbalanced. With more balanced data we would perhaps get a better result.
I decided to continue with the already trained CatBoost model, which showed the best result, and to create a Python script for production.
Python script
Code of the python script to classify users' reviews.
Input:
- CatBoost model
- Excel file with two columns. First column is unique identifier of the review, second column is review's text.
Output:
- Excel file with six columns. First two columns remain from the input file, other columns contain probability to belong to each category.
import nltk
# Download the NLTK data packages named below so they are available locally
# before the tokenizer, lemmatizer and stop-word imports are used.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import joblib
import pandas as pd
from optparse import OptionParser
import re
# Command-line interface: both the input workbook and the model file are required.
parser = OptionParser(usage="usage: %prog -t REVIEWS.xlsx -m MODEL.joblib")
parser.add_option("-t", "--text_file",
                  action="store", type="string", dest="text_path",
                  help="Excel file with the reviews to classify")
parser.add_option("-m", "--model_file",
                  action="store", type="string", dest="model_path",
                  help="joblib file with the trained model")
(options, args) = parser.parse_args()
# Without this check a missing option is left as None and only fails later,
# when the file is opened; parser.error prints the usage text and exits.
if not options.text_path or not options.model_path:
    parser.error("both --text_file and --model_file are required")
lemmatizer = WordNetLemmatizer()
# Every run of characters other than ASCII letters and apostrophes is a separator.
_NON_LETTERS = re.compile(r"[^a-zA-Z']+")


def clean_text(text):
    """Normalise one review: lower-case it, keep only letters, lemmatize.

    Args:
        text: Raw review text. Any non-string value (for example the NaN
            pandas produces for an empty Excel cell) is treated as an
            empty review instead of raising ``AttributeError``.

    Returns:
        The lemmatized tokens joined by single spaces, or ``""`` when
        nothing is left after cleaning.
    """
    # Empty cells arrive as float NaN / None, which have no .strip().
    if not isinstance(text, str):
        return ""
    # Lower-case first, then collapse everything that is not a letter or apostrophe.
    text = _NON_LETTERS.sub(' ', text.strip().lower())
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)
def predict_mark(text):
    """Predict marks for the given model input with the saved model.

    Args:
        text: Input handed unchanged to the loaded model's ``predict`` —
            in this script, the DataFrame of feature columns for all rows.

    Returns:
        Whatever the loaded model's ``predict`` returns for ``text``.
    """
    # The path comes from the parsed command-line options; the bare name
    # `model_path` was never defined.
    # NOTE(review): joblib.load unpickles the file — only pass trusted model files.
    model = joblib.load(options.model_path)
    # Predict on the argument itself (`text_preprocessed` was never defined).
    return model.predict(text)


df = pd.read_excel(options.text_path)
df['review'] = df['review'].apply(clean_text)
# Predict once for the whole frame: applying predict_mark per row reloaded the
# model for every review and handed `predict` a bare string.
# NOTE(review): assumes the model expects the columns ['review', 'id'], as in
# the training notebook, and that `predict` returns a numpy array — confirm.
# NOTE(review): the script description promises per-class probability columns;
# this still writes the predicted class only — confirm which is wanted.
df['mark'] = predict_mark(df[['review', 'id']]).ravel()
df.to_excel("output.xlsx")