{
"cells": [
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import sklearn\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import metrics\n",
"from pandas_ml import ConfusionMatrix\n",
"from matplotlib import pyplot as plt\n",
"from sklearn.linear_model import PassiveAggressiveClassifier\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"D:/Term 2 class notes/Datathon/data/task2-version5.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Source.Name | \n",
" article_name | \n",
" line_number | \n",
" news_type | \n",
" news_text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" article111111112.task2.labels | \n",
" 111111112 | \n",
" 1 | \n",
" non-propaganda | \n",
" US bloggers banned from entering UK | \n",
"
\n",
" \n",
" 1 | \n",
" article111111112.task2.labels | \n",
" 111111112 | \n",
" 2 | \n",
" non-propaganda | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" article111111112.task2.labels | \n",
" 111111112 | \n",
" 3 | \n",
" non-propaganda | \n",
" Two prominent US bloggers have been banned fro... | \n",
"
\n",
" \n",
" 3 | \n",
" article111111112.task2.labels | \n",
" 111111112 | \n",
" 4 | \n",
" non-propaganda | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" article111111112.task2.labels | \n",
" 111111112 | \n",
" 5 | \n",
" propaganda | \n",
" Pamela Geller and Robert Spencer co-founded an... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Source.Name article_name line_number news_type \\\n",
"0 article111111112.task2.labels 111111112 1 non-propaganda \n",
"1 article111111112.task2.labels 111111112 2 non-propaganda \n",
"2 article111111112.task2.labels 111111112 3 non-propaganda \n",
"3 article111111112.task2.labels 111111112 4 non-propaganda \n",
"4 article111111112.task2.labels 111111112 5 propaganda \n",
"\n",
" news_text \n",
"0 US bloggers banned from entering UK \n",
"1 NaN \n",
"2 Two prominent US bloggers have been banned fro... \n",
"3 NaN \n",
"4 Pamela Geller and Robert Spencer co-founded an... "
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"non-propaganda 11230\n",
"propaganda 3940\n",
"Name: news_type, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['news_type'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(15170, 5)"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"df = df.set_index([\"article_name\",\"line_number\"])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" Source.Name | \n",
" news_type | \n",
" news_text | \n",
"
\n",
" \n",
" article_name | \n",
" line_number | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 111111112 | \n",
" 1 | \n",
" article111111112.task2.labels | \n",
" non-propaganda | \n",
" US bloggers banned from entering UK | \n",
"
\n",
" \n",
" 2 | \n",
" article111111112.task2.labels | \n",
" non-propaganda | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" article111111112.task2.labels | \n",
" non-propaganda | \n",
" Two prominent US bloggers have been banned fro... | \n",
"
\n",
" \n",
" 4 | \n",
" article111111112.task2.labels | \n",
" non-propaganda | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" article111111112.task2.labels | \n",
" propaganda | \n",
" Pamela Geller and Robert Spencer co-founded an... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Source.Name news_type \\\n",
"article_name line_number \n",
"111111112 1 article111111112.task2.labels non-propaganda \n",
" 2 article111111112.task2.labels non-propaganda \n",
" 3 article111111112.task2.labels non-propaganda \n",
" 4 article111111112.task2.labels non-propaganda \n",
" 5 article111111112.task2.labels propaganda \n",
"\n",
" news_text \n",
"article_name line_number \n",
"111111112 1 US bloggers banned from entering UK \n",
" 2 NaN \n",
" 3 Two prominent US bloggers have been banned fro... \n",
" 4 NaN \n",
" 5 Pamela Geller and Robert Spencer co-founded an... "
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"del df['Source.Name']"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" news_type | \n",
" news_text | \n",
"
\n",
" \n",
" article_name | \n",
" line_number | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 111111112 | \n",
" 1 | \n",
" non-propaganda | \n",
" US bloggers banned from entering UK | \n",
"
\n",
" \n",
" 2 | \n",
" non-propaganda | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" non-propaganda | \n",
" Two prominent US bloggers have been banned fro... | \n",
"
\n",
" \n",
" 4 | \n",
" non-propaganda | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" propaganda | \n",
" Pamela Geller and Robert Spencer co-founded an... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" news_type \\\n",
"article_name line_number \n",
"111111112 1 non-propaganda \n",
" 2 non-propaganda \n",
" 3 non-propaganda \n",
" 4 non-propaganda \n",
" 5 propaganda \n",
"\n",
" news_text \n",
"article_name line_number \n",
"111111112 1 US bloggers banned from entering UK \n",
" 2 NaN \n",
" 3 Two prominent US bloggers have been banned fro... \n",
" 4 NaN \n",
" 5 Pamela Geller and Robert Spencer co-founded an... "
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"news_type 0\n",
"news_text 907\n",
"dtype: int64"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"df=df.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"news_type 0\n",
"news_text 0\n",
"dtype: int64"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" news_type | \n",
" news_text | \n",
"
\n",
" \n",
" article_name | \n",
" line_number | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 111111112 | \n",
" 1 | \n",
" non-propaganda | \n",
" US bloggers banned from entering UK | \n",
"
\n",
" \n",
" 3 | \n",
" non-propaganda | \n",
" Two prominent US bloggers have been banned fro... | \n",
"
\n",
" \n",
" 5 | \n",
" propaganda | \n",
" Pamela Geller and Robert Spencer co-founded an... | \n",
"
\n",
" \n",
" 7 | \n",
" non-propaganda | \n",
" They were due to speak at an English Defence L... | \n",
"
\n",
" \n",
" 9 | \n",
" non-propaganda | \n",
" A government spokesman said individuals whose ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" news_type \\\n",
"article_name line_number \n",
"111111112 1 non-propaganda \n",
" 3 non-propaganda \n",
" 5 propaganda \n",
" 7 non-propaganda \n",
" 9 non-propaganda \n",
"\n",
" news_text \n",
"article_name line_number \n",
"111111112 1 US bloggers banned from entering UK \n",
" 3 Two prominent US bloggers have been banned fro... \n",
" 5 Pamela Geller and Robert Spencer co-founded an... \n",
" 7 They were due to speak at an English Defence L... \n",
" 9 A government spokesman said individuals whose ... "
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y = df.news_type # output variable\n",
"df.drop(\"news_type\", axis=1) ## droping output variable\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"article_name line_number\n",
"111111112 1 non-propaganda\n",
" 3 non-propaganda\n",
" 5 propaganda\n",
" 7 non-propaganda\n",
" 9 non-propaganda\n",
"Name: news_type, dtype: object"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y.head()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"# Make training and test sets \n",
"X_train, X_test, y_train, y_test = train_test_split(df['news_text'], y, test_size=0.33, random_state=53)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building Count Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# Initialize the `count_vectorizer` \n",
"count_vectorizer = CountVectorizer(stop_words='english')"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"count_train = count_vectorizer.fit(X_train)\n",
"count_test = count_vectorizer.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"count_train = count_vectorizer.transform(X_train)\n",
"# count_train"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building TF-IDF Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Initialize the `tfidf_vectorizer` \n",
"tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)\n",
"# wich will not appering in all the words. here max_df represent the words accuring <80 of documents are considering.the \n",
"\n",
"# Fit the training Data\n",
"tfidf_train = tfidf_vectorizer.fit_transform(X_train)\n",
"tfidf_train = tfidf_vectorizer.transform(X_train)\n",
"\n",
"#print(tfidf_train.vocabulary_)\n",
"#print(tfidf_train.idf_)\n",
"#tfidf_train.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"tfidf_train = tfidf_vectorizer.fit_transform(X_train)\n",
"tfidf_test = tfidf_vectorizer.transform(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building Hash Vector"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Hashvector (its like count vectorizer but it wont generate vocabulary i.e dictionary of words it will \n",
"###give numbering we cant back transforming like count vectorizer)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"# Initialize the `hash_vectorizer` \n",
"hashing_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
" \" in version 0.21.\", DeprecationWarning)\n",
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
" \" in version 0.21.\", DeprecationWarning)\n",
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
" \" in version 0.21.\", DeprecationWarning)\n"
]
}
],
"source": [
"hash_train = hashing_vectorizer.fit(X_train)\n",
"hash_train = hashing_vectorizer.transform(X_train)\n",
"hash_test = hashing_vectorizer.transform(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building using Count vector"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.750\n",
"[[ 568 781]\n",
" [ 395 2963]]\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.79 0.88 0.83 3358\n",
" propaganda 0.59 0.42 0.49 1349\n",
"\n",
" avg / total 0.73 0.75 0.74 4707\n",
"\n"
]
}
],
"source": [
"## Fitting Naive Baye's Classifier for Multinomial Model\n",
"l_clf = MultinomialNB()\n",
"\n",
"l_clf.fit(count_train, y_train)\n",
"\n",
"pred = l_clf.predict(count_test)\n",
"score = metrics.accuracy_score(y_test, pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"cm = metrics.confusion_matrix(y_test, pred, labels=['propaganda', 'non-propaganda'])\n",
"print(cm)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.696\n",
"[[ 622 727]\n",
" [ 704 2654]]\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.78 0.79 0.79 3358\n",
" propaganda 0.47 0.46 0.47 1349\n",
"\n",
" avg / total 0.69 0.70 0.70 4707\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
}
],
"source": [
"## Fitting Passive Aggresive Classifier Model\n",
"\n",
"linear_clf = PassiveAggressiveClassifier(n_iter=50)\n",
"\n",
"linear_clf.fit(count_train, y_train)\n",
"pred = linear_clf.predict(count_test)\n",
"score = metrics.accuracy_score(y_test, pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"cm = metrics.confusion_matrix(y_test, pred, labels=['propaganda', 'non-propaganda'])\n",
"#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])\n",
"print(cm)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.697\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.78 0.80 0.79 3358\n",
" propaganda 0.47 0.44 0.45 1349\n",
"\n",
" avg / total 0.69 0.70 0.69 4707\n",
"\n"
]
}
],
"source": [
"# fitting decision tree\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"Dt=DecisionTreeClassifier()\n",
"Dt.fit(count_train,y_train)\n",
"pred=Dt.predict(count_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.748\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.77 0.91 0.84 3358\n",
" propaganda 0.61 0.34 0.44 1349\n",
"\n",
" avg / total 0.73 0.75 0.72 4707\n",
"\n"
]
}
],
"source": [
"# fitting Logistic regression classifier\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"Lr=LogisticRegression()\n",
"Lr.fit(count_train,y_train)\n",
"pred=Lr.predict(count_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.727\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.76 0.90 0.82 3358\n",
" propaganda 0.54 0.30 0.39 1349\n",
"\n",
" avg / total 0.70 0.73 0.70 4707\n",
"\n"
]
}
],
"source": [
"# fitting Random FOrest classifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"Rf=RandomForestClassifier()\n",
"Rf.fit(count_train,y_train)\n",
"pred=Rf.predict(count_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.713\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.71 1.00 0.83 3358\n",
" propaganda 0.00 0.00 0.00 1349\n",
"\n",
" avg / total 0.51 0.71 0.59 4707\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"# fitting Support vector Machines\n",
"\n",
"from sklearn.svm import SVC\n",
"\n",
"svm=SVC()\n",
"svm.fit(count_train,y_train)\n",
"pred=svm.predict(count_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building using Hash vector"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.714\n",
"[[ 2 1347]\n",
" [ 1 3357]]\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.71 1.00 0.83 3358\n",
" propaganda 0.67 0.00 0.00 1349\n",
"\n",
" avg / total 0.70 0.71 0.59 4707\n",
"\n"
]
}
],
"source": [
"## Fitting Naive Baye's Classifier for Multinomial Model\n",
"l_clf = MultinomialNB()\n",
"\n",
"l_clf.fit(hash_train, y_train)\n",
"\n",
"pred = l_clf.predict(hash_test)\n",
"score = metrics.accuracy_score(y_test, pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"cm = metrics.confusion_matrix(y_test, pred, labels=['propaganda', 'non-propaganda'])\n",
"#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])\n",
"print(cm)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.707\n",
"[[ 573 776]\n",
" [ 605 2753]]\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.78 0.82 0.80 3358\n",
" propaganda 0.49 0.42 0.45 1349\n",
"\n",
" avg / total 0.70 0.71 0.70 4707\n",
"\n"
]
}
],
"source": [
"## Fitting Passive Aggresive Classifier Model\n",
"\n",
"linear_clf = PassiveAggressiveClassifier(n_iter=50)\n",
"\n",
"linear_clf.fit(hash_train, y_train)\n",
"pred = linear_clf.predict(hash_test)\n",
"score = metrics.accuracy_score(y_test, pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"cm = metrics.confusion_matrix(y_test, pred, labels=['propaganda', 'non-propaganda'])\n",
"#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])\n",
"print(cm)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.699\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.77 0.83 0.80 3358\n",
" propaganda 0.47 0.38 0.42 1349\n",
"\n",
" avg / total 0.68 0.70 0.69 4707\n",
"\n"
]
}
],
"source": [
"# fitting decision tree\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"Dt=DecisionTreeClassifier()\n",
"Dt.fit(hash_train,y_train)\n",
"pred=Dt.predict(hash_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.736\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.74 0.98 0.84 3358\n",
" propaganda 0.72 0.13 0.22 1349\n",
"\n",
" avg / total 0.73 0.74 0.66 4707\n",
"\n"
]
}
],
"source": [
"# fitting Logistic regression classifier\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"Lr=LogisticRegression()\n",
"Lr.fit(hash_train,y_train)\n",
"pred=Lr.predict(hash_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.731\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.74 0.97 0.84 3358\n",
" propaganda 0.63 0.14 0.23 1349\n",
"\n",
" avg / total 0.71 0.73 0.66 4707\n",
"\n"
]
}
],
"source": [
"# fitting Random FOrest classifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"Rf=RandomForestClassifier()\n",
"Rf.fit(hash_train,y_train)\n",
"pred=Rf.predict(hash_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.713\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.71 1.00 0.83 3358\n",
" propaganda 0.00 0.00 0.00 1349\n",
"\n",
" avg / total 0.51 0.71 0.59 4707\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"# fitting Support vector Machines\n",
"\n",
"from sklearn.svm import SVC\n",
"\n",
"svm=SVC()\n",
"svm.fit(hash_train,y_train)\n",
"pred=svm.predict(hash_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.713\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.72 0.99 0.83 3358\n",
" propaganda 0.50 0.03 0.06 1349\n",
"\n",
" avg / total 0.66 0.71 0.61 4707\n",
"\n"
]
}
],
"source": [
"# fitting KNN classifier\n",
"\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"knn=KNeighborsClassifier()\n",
"knn.fit(hash_train,y_train)\n",
"pred=knn.predict(hash_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building using Tf Idf Vector"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.727\n",
"[[ 73 1276]\n",
" [ 11 3347]]\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.72 1.00 0.84 3358\n",
" propaganda 0.87 0.05 0.10 1349\n",
"\n",
" avg / total 0.77 0.73 0.63 4707\n",
"\n"
]
}
],
"source": [
"## Fitting Naive Baye's Classifier for Multinomial Model\n",
"l_clf = MultinomialNB()\n",
"\n",
"l_clf.fit(tfidf_train, y_train)\n",
"\n",
"pred = l_clf.predict(tfidf_test)\n",
"score = metrics.accuracy_score(y_test, pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"cm = metrics.confusion_matrix(y_test, pred, labels=['propaganda', 'non-propaganda'])\n",
"print(cm)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.708\n",
"[[ 613 736]\n",
" [ 637 2721]]\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.79 0.81 0.80 3358\n",
" propaganda 0.49 0.45 0.47 1349\n",
"\n",
" avg / total 0.70 0.71 0.70 4707\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
}
],
"source": [
"## Fitting Passive Aggresive Classifier Model\n",
"\n",
"linear_clf = PassiveAggressiveClassifier(n_iter=50)\n",
"\n",
"linear_clf.fit(tfidf_train, y_train)\n",
"pred = linear_clf.predict(tfidf_test)\n",
"score = metrics.accuracy_score(y_test, pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"cm = metrics.confusion_matrix(y_test, pred, labels=['propaganda', 'non-propaganda'])\n",
"print(cm)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.704\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.77 0.82 0.80 3358\n",
" propaganda 0.48 0.40 0.44 1349\n",
"\n",
" avg / total 0.69 0.70 0.70 4707\n",
"\n"
]
}
],
"source": [
"# fitting decision tree\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"Dt=DecisionTreeClassifier()\n",
"Dt.fit(tfidf_train,y_train)\n",
"pred=Dt.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.737\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.74 0.98 0.84 3358\n",
" propaganda 0.75 0.13 0.21 1349\n",
"\n",
" avg / total 0.74 0.74 0.66 4707\n",
"\n"
]
}
],
"source": [
"# fitting Logistic regression classifier\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"Lr=LogisticRegression()\n",
"Lr.fit(tfidf_train,y_train)\n",
"pred=Lr.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.731\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.75 0.94 0.83 3358\n",
" propaganda 0.59 0.20 0.30 1349\n",
"\n",
" avg / total 0.70 0.73 0.68 4707\n",
"\n"
]
}
],
"source": [
"# fitting Random FOrest classifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"Rf=RandomForestClassifier()\n",
"Rf.fit(tfidf_train,y_train)\n",
"pred=Rf.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.713\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.71 1.00 0.83 3358\n",
" propaganda 0.00 0.00 0.00 1349\n",
"\n",
" avg / total 0.51 0.71 0.59 4707\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"# fitting Support vector Machines\n",
"\n",
"from sklearn.svm import SVC\n",
"\n",
"svm=SVC()\n",
"svm.fit(tfidf_train,y_train)\n",
"pred=svm.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.712\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.72 0.99 0.83 3358\n",
" propaganda 0.47 0.03 0.05 1349\n",
"\n",
" avg / total 0.65 0.71 0.61 4707\n",
"\n"
]
}
],
"source": [
"# fitting KNN classifier\n",
"\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"knn=KNeighborsClassifier()\n",
"knn.fit(tfidf_train,y_train)\n",
"pred=knn.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: xgboost in c:\\users\\hitman\\anaconda3\\lib\\site-packages (0.81)\n",
"Requirement already satisfied: scipy in c:\\users\\hitman\\anaconda3\\lib\\site-packages (from xgboost) (1.1.0)\n",
"Requirement already satisfied: numpy in c:\\users\\hitman\\anaconda3\\lib\\site-packages (from xgboost) (1.14.3)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.723\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.72 1.00 0.84 3358\n",
" propaganda 0.79 0.05 0.09 1349\n",
"\n",
" avg / total 0.74 0.72 0.62 4707\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n"
]
}
],
"source": [
"# fitting Xg boost classifier\n",
"!pip install xgboost\n",
"from xgboost.sklearn import XGBClassifier\n",
"\n",
"Xg = XGBClassifier()\n",
"Xg.fit(tfidf_train,y_train)\n",
"pred=Xg.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test,pred))"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.702\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.78 0.81 0.79 3358\n",
" propaganda 0.48 0.44 0.46 1349\n",
"\n",
" avg / total 0.69 0.70 0.70 4707\n",
"\n",
"MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
" beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
" hidden_layer_sizes=30, learning_rate='constant',\n",
" learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
" nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
" shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
" verbose=False, warm_start=False)\n"
]
}
],
"source": [
"from sklearn.neural_network import MLPClassifier\n",
"\n",
"mlp=MLPClassifier(hidden_layer_sizes=(30))\n",
"mlp.fit(tfidf_train,y_train)\n",
"pred=mlp.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"\n",
"from sklearn.metrics import classification_report\n",
"\n",
"print(classification_report(y_test,pred))\n",
"print(mlp)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.707\n",
" precision recall f1-score support\n",
"\n",
"non-propaganda 0.79 0.80 0.80 3358\n",
" propaganda 0.49 0.46 0.48 1349\n",
"\n",
" avg / total 0.70 0.71 0.71 4707\n",
"\n",
"MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
" beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
" hidden_layer_sizes=1, learning_rate='constant',\n",
" learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
" nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
" shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
" verbose=False, warm_start=False)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Hitman\\Anaconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
}
],
"source": [
"from sklearn.neural_network import MLPClassifier\n",
"\n",
"mlp=MLPClassifier(hidden_layer_sizes=(1))\n",
"mlp.fit(tfidf_train,y_train)\n",
"pred=mlp.predict(tfidf_test)\n",
"score=metrics.accuracy_score(y_test,pred)\n",
"print(\"accuracy: %0.3f\" % score)\n",
"\n",
"from sklearn.metrics import classification_report\n",
"\n",
"print(classification_report(y_test,pred))\n",
"print(mlp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}