import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from string import punctuation 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
# Fetch the NLTK data this notebook relies on further down: the stop-word
# corpus (read via stopwords.words) and the Punkt tokenizer data
# (presumably what word_tokenize needs -- confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/z3534407/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/z3534407/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True

# Read the competition CSVs from the working directory: training rows first,
# then the test rows.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

df_train.head()

df_test.head()

df_train.shape, df_test.shape

((1306122, 3), (375806, 2))

# Number of training rows per value of the "target" label column.
df_train["target"].value_counts()

target
0    1225312
1      80810
Name: count, dtype: int64

# Row subsets of the training frame by label value (1 -> insincere, 0 -> sincere).
insincere = df_train.loc[df_train["target"].eq(1)]
sincere = df_train.loc[df_train["target"].eq(0)]

# Bar chart of how many training questions carry each target value.
question_class = df_train["target"].value_counts()
colors = ["#B92B27", "dodgerblue"]
question_class.plot(kind="bar", color=colors, edgecolor="black")
plt.xlabel("Question Class")
plt.ylabel("Number of Questions")
plt.title("Distribution of Question Classes in Training Data")
# Print the exact count just above each bar; bar k is drawn at x == k.
for bar_position, n_questions in enumerate(question_class.tolist()):
    plt.text(bar_position, n_questions + 0.1, f"{n_questions}", ha="center", va="bottom")
plt.show()

# Report the share of each class among ALL training questions.
# The share must be taken against the total row count; dividing the insincere
# count by the sincere count gives an odds-style ratio, not a percentage of
# questions, and overstates the insincere share.
target_counts = df_train["target"].value_counts()
print(target_counts)
insincere_share = float(target_counts.get(1, 0) / target_counts.sum() * 100)
print(insincere_share, "percent of questions are insincere.")
print(100 - insincere_share, "percent of questions are sincere")

target
0    1225312
1      80810
Name: count, dtype: int64
6.595054973753624 percent of questions are insincere.
93.40494502624638 percent of questions are sincere

print("Original Class Distribution:")
print(df_train["target"].value_counts())

# Randomly under-sample so both target classes end up with the same number of
# rows; the seed keeps the selection repeatable.
features = df_train.drop(columns="target")
labels = df_train["target"]
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(features, labels)

# Re-attach the labels to the sampled feature rows.
df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

print("\nBalanced Class Distribution:")
print(df_balanced["target"].value_counts())

n_insincere = int((df_balanced["target"] == 1).sum())
insincere_percent = n_insincere / len(df_balanced) * 100
sincere_percent = 100 - insincere_percent

print(f"\n{insincere_percent:.2f}% of questions are insincere.")
print(f"{sincere_percent:.2f}% of questions are sincere.")

Original Class Distribution:
target
0    1225312
1      80810
Name: count, dtype: int64

Balanced Class Distribution:
target
0    80810
1    80810
Name: count, dtype: int64

50.00% of questions are insincere.
50.00% of questions are sincere.

# From here on `df_train` names an independent copy of the balanced frame;
# the full, imbalanced training data is no longer reachable through this name.
df_train = df_balanced.copy()
df_train.shape

(161620, 3)

def _show_wordcloud(questions):
    """Draw a word cloud of the question texts in *questions* and return it.

    Parameters:
        questions: pandas Series holding one question text per row.

    Returns:
        The generated WordCloud object (also shown via matplotlib).
    """
    # Join the real texts. str(series) would only give pandas' abbreviated
    # display form (a few rows, ellipses and the "Name: ..., dtype: ..." footer),
    # so the cloud would be built from display residue instead of the questions.
    corpus = " ".join(questions.astype(str))
    # Pass the stop-word set explicitly; it is kept local so the module-level
    # name `stopwords` (the NLTK corpus reader) is not overwritten.
    cloud = WordCloud(width=600, height=400, stopwords=set(STOPWORDS)).generate(corpus)
    plt.figure(figsize=(10, 8), facecolor="black")
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    return cloud


sincere_wordcloud = _show_wordcloud(sincere["question_text"])

insincere_wordcloud = _show_wordcloud(insincere["question_text"])

# Re-import so that `stopwords` is the NLTK corpus reader here, whatever an
# earlier cell may have bound that name to.
import nltk
from nltk.corpus import stopwords
import pandas as pd

# Built once; set membership keeps the per-word stop-word test cheap.
stop_words = set(stopwords.words("english"))

# Add the same count features to both frames. Every feature coerces the cell
# with str() first, so a missing/non-string question cannot raise in one
# feature while being silently accepted by the others.
for frame in (df_train, df_test):
    questions = frame["question_text"]
    # Whitespace-separated tokens.
    frame["number_words"] = questions.apply(lambda text: len(str(text).split()))
    # Distinct whitespace-separated tokens (case-sensitive).
    frame["num_unique_words"] = questions.apply(lambda text: len(set(str(text).split())))
    # Raw character length.
    frame["num_chars"] = questions.apply(lambda text: len(str(text)))
    # Tokens whose lower-cased form is an English stop word.
    frame["num_stopwords"] = questions.apply(
        lambda text: sum(word.lower() in stop_words for word in str(text).split())
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/z3534407/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

# Character- and case-based counts, added identically to both frames.
for frame in (df_train, df_test):
    questions = frame["question_text"]
    # Characters that appear in string.punctuation.
    frame["num_punctuation"] = questions.apply(
        lambda text: sum(1 for char in str(text) if char in punctuation)
    )
    # Whitespace-separated tokens for which str.isupper() is true.
    frame["num_uppercase"] = questions.apply(
        lambda text: sum(1 for token in str(text).split() if token.isupper())
    )
    # Whitespace-separated tokens for which str.islower() is true.
    frame["num_lowercase"] = questions.apply(
        lambda text: sum(1 for token in str(text).split() if token.islower())
    )
    # Whitespace-separated tokens for which str.istitle() is true.
    frame["num_title"] = questions.apply(
        lambda text: sum(1 for token in str(text).split() if token.istitle())
    )

# Summary statistics of the numeric columns for rows with target == 1 ...
df_train.loc[df_train["target"].eq(1)].describe()

# ... and for rows with target == 0.
df_train.loc[df_train["target"].eq(0)].describe()

# Box plot comparing the number_words feature across the two target values.
fig, ax = plt.subplots(figsize=(12, 10))
colors = ["#B92B27", "dodgerblue"]
sns.set_palette(sns.color_palette(colors))
sns.boxplot(x="target", y="number_words", data=df_train, orient="v", ax=ax)
ax.set_xlabel("Target")
ax.set_ylabel("Number of Words")
ax.set_title("Box Plot of Number of Words According to the Target")
plt.show()

# Deletes every string.punctuation character; built once instead of per call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Filled on first use by _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def text_process(text):
    """Normalize one question for vectorization.

    Lower-cases *text*, deletes all ``string.punctuation`` characters,
    tokenizes with ``word_tokenize`` and drops English stop words.

    Parameters:
        text: the raw question as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (may be empty).
    """
    # This function is applied row by row to whole DataFrame columns, so the
    # stop-word set and translation table are reused rather than rebuilt
    # for every row.
    stop_words = _english_stop_words()
    text = text.lower().translate(_PUNCTUATION_TABLE)
    kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(kept_tokens)

# Store the cleaned text next to the raw question in the training frame.
df_train['clean_train'] = df_train["question_text"].map(text_process)
df_train.head()

# Same cleaning for the test questions.
df_test['clean_test'] = df_test["question_text"].map(text_process)
df_test

# Hold out 20% of the cleaned training questions for validation.
# random_state makes the split repeatable across runs (42 matches the seed
# already used for the under-sampler); stratify keeps the class proportions
# of the target identical in both parts.
X_train, X_val, Y_train, Y_val = train_test_split(
    df_train['clean_train'],
    df_train['target'],
    test_size=0.2,
    random_state=42,
    stratify=df_train['target'],
)
X_train.shape,X_val.shape,Y_train.shape,Y_val.shape

((129296,), (32324,), (129296,), (32324,))

# Bag-of-n-grams (1- to 4-word) counts feeding a logistic-regression classifier.
pipeline = Pipeline(
    [
        # max_df=0.9 ignores terms that occur in more than 90% of the documents.
        ("cv", CountVectorizer(analyzer="word", ngram_range=(1, 4), max_df=0.9)),
        # The saga solver shuffles the data randomly; fixing random_state makes
        # the fitted coefficients repeatable from run to run.
        (
            "clf",
            LogisticRegression(
                solver="saga",
                class_weight="balanced",
                C=0.45,
                max_iter=250,
                verbose=1,
                random_state=42,
            ),
        ),
    ]
)

# `lr_model` and `pipeline` are used interchangeably by later cells.
lr_model = pipeline.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

convergence after 164 epochs took 28 seconds

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   28.4s finished

# Score the held-out validation split: textual per-class report first, then
# the confusion matrix drawn as an annotated heatmap.
Y_pred = lr_model.predict(X_val)
validation_report = classification_report(Y_val, Y_pred)
print(validation_report)
cm = confusion_matrix(Y_val, Y_pred)
sns.heatmap(data=cm, annot=True, fmt=".0f", square=True, cmap="Blues");

              precision    recall  f1-score   support

           0       0.86      0.90      0.88     16084
           1       0.90      0.86      0.88     16240

    accuracy                           0.88     32324
   macro avg       0.88      0.88      0.88     32324
weighted avg       0.88      0.88      0.88     32324

# Predict a label for every cleaned test question using the pipeline fitted above.
y_pred_final = pipeline.predict(df_test['clean_test'])
y_pred_final

array([1, 0, 0, ..., 0, 0, 1])

# Two-column submission: the test question ids and the predicted label for each.
df_sub = df_test[["qid"]].assign(prediction=y_pred_final)
df_sub.to_csv('submission.csv', index=False)
df_sub.head()

	qid	question_text
0	00002165364db923c7e6	How did Quebec nationalists see their province...
1	000032939017120e6e44	Do you have an adopted dog, how would you enco...
2	0000412ca6e4628ce2cf	Why does velocity affect time? Does velocity a...
3	000042bf85aa498cd78e	How did Otto von Guericke used the Magdeburg h...
4	0000455dfa3e01eae3af	Can I convert montra helicon D to a mountain b...

	qid	question_text
0	0000163e3ea7c7a74cd7	Why do so many women become so rude and arroga...
1	00002bd4fb5d505b9161	When should I apply for RV college of engineer...
2	00007756b4a147d2b0b3	What is it really like to be a nurse practitio...
3	000086e4b7e1c7146103	Who are entrepreneurs?
4	0000c4c3fbe8785a3090	Is education really making good people nowadays?

	target	number_words	num_unique_words	num_chars	num_stopwords	num_punctuation	num_uppercase	num_lowercase	num_title
count	80810.0	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000
mean	1.0	17.277812	16.037594	98.064163	8.000829	2.369905	0.326284	13.919453	2.962826
std	0.0	9.568309	8.153619	55.186227	4.918845	2.906119	0.896822	8.661324	1.971440
min	1.0	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	1.0	10.000000	10.000000	55.000000	4.000000	1.000000	0.000000	7.000000	2.000000
50%	1.0	15.000000	14.000000	86.000000	7.000000	2.000000	0.000000	12.000000	3.000000
75%	1.0	23.000000	21.000000	130.000000	11.000000	3.000000	0.000000	19.000000	4.000000
max	1.0	64.000000	48.000000	1017.000000	37.000000	411.000000	37.000000	56.000000	37.000000

	target	number_words	num_unique_words	num_chars	num_stopwords	num_punctuation	num_uppercase	num_lowercase	num_title
count	80810.0	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000	80810.000000
mean	0.0	12.501609	11.876104	68.821643	6.035132	1.705779	0.462542	10.017188	2.072380
std	0.0	6.727994	5.761206	36.553370	3.600207	1.564554	0.850761	6.145395	1.445841
min	0.0	2.000000	2.000000	11.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.0	8.000000	8.000000	44.000000	4.000000	1.000000	0.000000	6.000000	1.000000
50%	0.0	11.000000	10.000000	59.000000	5.000000	1.000000	0.000000	8.000000	2.000000
75%	0.0	15.000000	14.000000	82.000000	7.000000	2.000000	1.000000	12.000000	3.000000
max	0.0	56.000000	47.000000	263.000000	37.000000	96.000000	16.000000	51.000000	21.000000

	qid	question_text	number_words	num_unique_words	num_chars	num_stopwords	num_punctuation	num_lowercase	num_title	clean_train
1022714	c86ab618f85e9b7fc374	Is sadism a coping mechanism for people who ar...	16	16	94	8	1	15	1	sadism coping mechanism people extremely affec...
641364	7d9ea6d66b8866e69240	Is it possible for me as a soul to go outside ...	18	18	84	11	1	17	1	possible soul go outside body observe universe
1225111	f01982d0cd06aba308ed	Do Pet Animal Rescue workers minimize shows of...	23	23	143	10	2	19	4	pet animal rescue workers minimize shows affec...
1130433	dd8a6b5452a407cea2ac	How do you identify a sonnet and what can we i...	15	14	71	11	1	14	1	identify sonnet infer suggest
1220402	ef30e73bf0a81a06ccf6	Is there a special place in hell for the likes...	13	13	63	7	1	10	3	special place hell likes genghiz khan

Quora Insincere Questions Classification

Dataset Link: Quora Insincere Questions Classification

	qid	question_text	number_words	num_unique_words	num_chars	num_stopwords	num_punctuation	num_uppercase	num_lowercase	num_title	clean_test
0	0000163e3ea7c7a74cd7	Why do so many women become so rude and arroga...	21	19	101	11	1	0	20	1	many women become rude arrogant get little bit...
1	00002bd4fb5d505b9161	When should I apply for RV college of engineer...	30	23	162	17	2	6	22	5	apply rv college engineering bms college engin...
2	00007756b4a147d2b0b3	What is it really like to be a nurse practitio...	10	10	50	6	1	0	9	1	really like nurse practitioner
3	000086e4b7e1c7146103	Who are entrepreneurs?	3	3	22	2	1	0	2	1	entrepreneurs
4	0000c4c3fbe8785a3090	Is education really making good people nowadays?	7	7	48	1	1	0	6	1	education really making good people nowadays
...	...	...	...	...	...	...	...	...	...	...	...
375801	ffff7fa746bd6d6197a9	How many countries listed in gold import in in...	9	8	50	3	1	0	8	1	many countries listed gold import indua
375802	ffffa1be31c43046ab6b	Is there an alternative to dresses on formal p...	9	9	53	5	1	0	8	1	alternative dresses formal parties
375803	ffffae173b6ca6bfa563	Where I can find best friendship quotes in Tel...	9	9	50	4	1	1	6	3	find best friendship quotes telugu
375804	ffffb1f7f1a008620287	What are the causes of refraction of light?	8	7	43	5	1	0	7	1	causes refraction light
375805	fffff85473f4699474b0	Climate change is a worrying topic. How much t...	36	33	189	17	7	2	30	5	climate change worrying topic much time left f...

Quora Insincere Questions Classification

Dataset Link: Quora Insincere Questions Classification

Dataset Link: Quora Insincere Questions Classification