diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..42eb591 --- /dev/null +++ b/.gitignore @@ -0,0 +1,211 @@ +# Указанные папки +dataset/ +.venv/ +.vscode/ +imgs/ +venv/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +Pipfile.lock + +# PEP 582 +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# macOS +.DS_Store +.AppleDouble +.LSOverride +Icon +._* +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +# Windows +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db +*.tmp +*.temp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders +.target/ +.metadata +.factorypath +.buildpath +.classpath +.project +.externalToolBuilders/ +*.launch +.pydevproject +.cproject +.autotools +.factorypath +.buildpath +.target/ +.tern-project +.idea/ +*.sublime-workspace +*.sublime-project + +# Linux +*~ +.fuse_hidden* +.directory 
+.Trash-* +.nfs* + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Data files +*.csv +*.json +*.pkl +*.pickle +*.h5 +*.hdf5 +*.parquet +*.feather +*.xlsx +*.xls +*.numbers + +# Model files +*.model +*.joblib +*.pkl +*.h5 +*.hdf5 +*.onnx +*.pb +*.tflite + +# Logs +*.log +logs/ + +# Temporary files +tmp/ +temp/ +.tmp/ +.temp/ + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db diff --git a/1. data_exploration.ipynb b/1. data_exploration.ipynb new file mode 100644 index 0000000..606fca7 --- /dev/null +++ b/1. data_exploration.ipynb @@ -0,0 +1,1713 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Natural Language Processing Challenge - Data Exploration\n", + "\n", + "## Project Overview\n", + "\n", + "This notebook explores a dataset containing news articles to identify whether a news headline is real or fake news.\n", + "\n", + "### Dataset Information\n", + "- **Training Data**: `dataset/data.csv` (39,942 articles)\n", + "- **Validation Data**: `dataset/validation_data.csv` (4,956 articles)\n", + "\n", + "### Columns\n", + "- **`label`**: 0 if fake news, 1 if real news\n", + "- **`title`**: The headline of the news article\n", + "- **`text`**: The full content of the article\n", + "- **`subject`**: The category or topic of the news\n", + "- **`date`**: The publication date of the article\n", + "\n", + "### Goal\n", + "Build a classifier to distinguish between real and fake news, then predict labels for the validation dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Setup and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All libraries imported successfully!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /Users/sergej/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + "# Standard data science libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Text processing libraries\n", + "import re\n", + "import string\n", + "from collections import Counter\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# Machine learning libraries\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "# Visualization settings - using basic matplotlib to avoid seaborn compatibility issues\n", + "plt.style.use('default')\n", + "plt.rcParams['figure.figsize'] = (12, 8)\n", + "\n", + "# Download required NLTK data\n", + "try:\n", + " nltk.data.find('tokenizers/punkt')\n", + "except LookupError:\n", + " nltk.download('punkt')\n", + " \n", + "try:\n", + " nltk.data.find('corpora/stopwords')\n", + "except LookupError:\n", + " nltk.download('stopwords')\n", + " \n", + "try:\n", + " nltk.data.find('corpora/wordnet')\n", + "except 
LookupError:\n", + " nltk.download('wordnet')\n", + "\n", + "print(\"All libraries imported successfully!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load and Explore the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading training data...\n", + "Training data shape: (39942, 5)\n", + "\n", + "Loading validation data...\n", + "Validation data shape: (4956, 5)\n", + "\n", + "✅ Data loaded successfully!\n" + ] + } + ], + "source": [ + "# Load the training data\n", + "print(\"Loading training data...\")\n", + "train_data = pd.read_csv('dataset/data.csv')\n", + "print(f\"Training data shape: {train_data.shape}\")\n", + "\n", + "# Load the validation data\n", + "print(\"\\nLoading validation data...\")\n", + "validation_data = pd.read_csv('dataset/validation_data.csv')\n", + "print(f\"Validation data shape: {validation_data.shape}\")\n", + "\n", + "print(\"\\n✅ Data loaded successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== TRAINING DATA INFO ===\n", + "\n", + "RangeIndex: 39942 entries, 0 to 39941\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 label 39942 non-null int64 \n", + " 1 title 39942 non-null object\n", + " 2 text 39942 non-null object\n", + " 3 subject 39942 non-null object\n", + " 4 date 39942 non-null object\n", + "dtypes: int64(1), object(4)\n", + "memory usage: 1.5+ MB\n", + "None\n", + "\n", + "=== VALIDATION DATA INFO ===\n", + "\n", + "RangeIndex: 4956 entries, 0 to 4955\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 label 4956 non-null int64 \n", + " 1 title 4956 non-null object\n", + " 2 text 4956 non-null 
object\n", + " 3 subject 4956 non-null object\n", + " 4 date 4956 non-null object\n", + "dtypes: int64(1), object(4)\n", + "memory usage: 193.7+ KB\n", + "None\n" + ] + } + ], + "source": [ + "# Display basic information about the datasets\n", + "print(\"=== TRAINING DATA INFO ===\")\n", + "print(train_data.info())\n", + "print(\"\\n=== VALIDATION DATA INFO ===\")\n", + "print(validation_data.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== TRAINING DATA PREVIEW ===\n", + " label title \\\n", + "0 1 As U.S. budget fight looms, Republicans flip t... \n", + "1 1 U.S. military to accept transgender recruits o... \n", + "2 1 Senior U.S. Republican senator: 'Let Mr. Muell... \n", + "3 1 FBI Russia probe helped by Australian diplomat... \n", + "4 1 Trump wants Postal Service to charge 'much mor... \n", + "\n", + " text subject \\\n", + "0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n", + "1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n", + "2 WASHINGTON (Reuters) - The special counsel inv... politicsNews \n", + "3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews \n", + "4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews \n", + "\n", + " date \n", + "0 December 31, 2017 \n", + "1 December 29, 2017 \n", + "2 December 31, 2017 \n", + "3 December 30, 2017 \n", + "4 December 29, 2017 \n", + "\n", + "=== VALIDATION DATA PREVIEW ===\n", + " label title \\\n", + "0 2 UK's May 'receiving regular updates' on London... \n", + "1 2 UK transport police leading investigation of L... \n", + "2 2 Pacific nations crack down on North Korean shi... \n", + "3 2 Three suspected al Qaeda militants killed in Y... \n", + "4 2 Chinese academics prod Beijing to consider Nor... \n", + "\n", + " text subject \\\n", + "0 LONDON (Reuters) - British Prime Minister Ther... 
worldnews \n", + "1 LONDON (Reuters) - British counter-terrorism p... worldnews \n", + "2 WELLINGTON (Reuters) - South Pacific island na... worldnews \n", + "3 ADEN, Yemen (Reuters) - Three suspected al Qae... worldnews \n", + "4 BEIJING (Reuters) - Chinese academics are publ... worldnews \n", + "\n", + " date \n", + "0 September 15, 2017 \n", + "1 September 15, 2017 \n", + "2 September 15, 2017 \n", + "3 September 15, 2017 \n", + "4 September 15, 2017 \n" + ] + } + ], + "source": [ + "# Display first few rows of training data\n", + "print(\"=== TRAINING DATA PREVIEW ===\")\n", + "print(train_data.head())\n", + "print(\"\\n=== VALIDATION DATA PREVIEW ===\")\n", + "print(validation_data.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== MISSING VALUES IN TRAINING DATA ===\n", + "label 0\n", + "title 0\n", + "text 0\n", + "subject 0\n", + "date 0\n", + "dtype: int64\n", + "\n", + "=== MISSING VALUES IN VALIDATION DATA ===\n", + "label 0\n", + "title 0\n", + "text 0\n", + "subject 0\n", + "date 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Check for missing values\n", + "print(\"=== MISSING VALUES IN TRAINING DATA ===\")\n", + "print(train_data.isnull().sum())\n", + "print(\"\\n=== MISSING VALUES IN VALIDATION DATA ===\")\n", + "print(validation_data.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== TRAINING DATA TYPES ===\n", + "label int64\n", + "title object\n", + "text object\n", + "subject object\n", + "date object\n", + "dtype: object\n", + "\n", + "=== VALIDATION DATA TYPES ===\n", + "label int64\n", + "title object\n", + "text object\n", + "subject object\n", + "date object\n", + "dtype: object\n" + ] + } + ], + "source": [ + "# Check data types\n", + "print(\"=== TRAINING DATA TYPES ===\")\n", + 
"print(train_data.dtypes)\n", + "print(\"\\n=== VALIDATION DATA TYPES ===\")\n", + "print(validation_data.dtypes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Data Distribution Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== LABEL DISTRIBUTION ===\n", + "Training data:\n", + "label\n", + "1 19999\n", + "0 19943\n", + "Name: count, dtype: int64\n", + "\n", + "Fake news percentage: 49.93%\n", + "Real news percentage: 50.07%\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA90AAAJOCAYAAACqS2TfAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAASK5JREFUeJzt3QeUVeW9N+B3EAEbWJAWEWyxIigqYmLhSsASExRv7GKPBntiIVY0NxiMNaJcExWTWLmfErsiikbBhkFExc+CXcBYQFGp863/u+453wxNymyHGZ5nrb3O7L3fs8/e+xw8/s7bKiorKysTAAAAUOMa1PwhAQAAgCB0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDUKh33nknVVRUpD/+8Y81dsyRI0fmY8ZjTbvwwgvzsb8Pu+22W17mva7/+Z//+V5e/4gjjkjt27dPtfWZGDJkSKpL4pzj87E04j7H/QZgxSN0AzCfCEMRMF544YVUH66jtDRp0iS1adMm9ezZM1199dXpyy+/rJHX+eijj3IYGzt2bFreLM/nVsR7vLClNn5cWF5UvQ8NGzZMa6+9durcuXM65ZRT0quvvrrUx/3666/zZ6uIH78A6pOGtX0CAFC0iy66KG2wwQZp1qxZadKkSTkknHrqqenyyy9P99xzT9p6663LZc8999x09tlnL3Gw7d+/fw52nTp1WuznPfLII6loizq3P//5z2nu3Lnp+9auXbv0zTffpJVXXnmZj7XLLrukv/3tb9W2HXPMMWmHHXZIxx13XHnb6quvvsyvFeccoXVpvP7666lBg9qr6/jJT36SDj/88FRZWZmmTp2aXnrppXTzzTena6+9Nv3hD39Ip59++lKF7vhshaotNgCoTugGoN7bc88903bbbVde79evX3rsscfST3/60/Szn/0svfbaa2mVVVbJ+yJULW2wWpKwsuqqq6ZGjRql2lQToXdplFod1IQNN9wwL1Udf/zxeduhhx660OfNnj07/+CwJO/Bspxz48aNU2364Q9/ON/9uOSSS9I+++yTfv3rX6fNNtss7bXXXrV2fgD1meblACyVmTNnpvPPPz83U23WrFlabbXV0s4775wef/zxhT7niiuuyLWcEXB33XXXNH78+PnKTJgwIe2///65CWyEnAjLURtd0/7jP/4jnXfeeendd99Nf//73xfZp3v48OHpxz/+cVpzzTVzjemmm26afvvb3+Z9UWu+/fbb57+PPPLIcjPeUn/lqAHcaqut0pgxY3KtbITt0nPn7dNdMmfOnFymVatW+b7GDwPvv//+YvURrnrM7zq3BfXpnj59
eg5hbdu2zUExrjX640cNaVVxnBNPPDENGzYsX1+U3XLLLdNDDz20VH2641zi3n744YepV69e+e911103/eY3v8n3o6bGFbjyyivTRhttlM83mlYvyed43j7dpc/Km2++mc8/Ph9xjLjX8cPKot6vUrP4p59+Otcyx7XGa++7777pk08+qfbc+HEgXiu6RsTnp1u3bvncl7Wf+DrrrJNuv/32/CPTf/3Xf5W3L849iXsa5xyitrv02Srdn3HjxuVzix8/4t9xfJaPOuqo9Omnny71+QLUVWq6AVgq06ZNS3/5y1/SQQcdlI499tjcP/qGG27I/aWfe+65+Zoy//Wvf81l+vbtm7799tt01VVX5eD78ssvp5YtW+Yyr7zySvrRj36UfvCDH+Qm3vE/+3feeWcOYf/n//yfHEhq0mGHHZbDbTTzjmtYkDinqBGPJujRTD3CWoSsCEth8803z9sjpERz5ggnYaeddiofI4JG1LYfeOCBubaxdL0LEwEoAsxZZ52VpkyZkoNi9+7dc7/sUo384licc6sqgnUE/AhXRx99dH4PH3744XTGGWfkMBw/mlT11FNPpbvuuiv96le/SmussUbuJ9+7d+/03nvv5UC3pCJcx+enS5cuOSA/+uij6bLLLssh+YQTTkjL6qabbsqfvbgX8T7GDztL+jlekF/84he5+8KAAQPSiy++mI/XokWL3Gz7u5x00klprbXWShdccEEOsvFex48Zd9xxR7WWGQMHDsy10nFe0TQ8HuNaltX666+ffwCL9zzuRdOmTRfrnkTgvu666/L7Ev8u99tvv3y8UleN+KHq7bffzj9AROCOf0fXX399fnzmmWe+t8EKAZYLlQAwj5tuuimqNSuff/75hZaZPXt25YwZM6pt+/zzzytbtmxZedRRR5W3TZw4MR9rlVVWqfzggw/K25999tm8/bTTTitv23333Ss7dOhQ+e2335a3zZ07t3KnnXaq3GSTTcrbHn/88fzceFzW62jWrFnlNttsU16/4IIL8nNKrrjiirz+ySefLPQYcfwoE683r1133TXvGzx48AL3xTLvdf3gBz+onDZtWnn7nXfembdfddVV5W3t2rWr7NOnz3cec1HnFs+P45QMGzYsl/3d735Xrdz+++9fWVFRUfnmm2+Wt0W5Ro0aVdv20ksv5e1/+tOfKhel9Jmoek5xLrHtoosuqlY23pvOnTtXLonVVlut2r0pvV7Tpk0rp0yZslSf49I1x+dj3s/KvOX23XffynXWWafatnnfr9Jns3v37vkzXhL/HlZaaaXKL774Iq9PmjSpsmHDhpW9evWqdrwLL7wwP39Bn4F5Rbm+ffsudP8pp5ySy8T7tyT3JP5NzHtPSr7++uv5tt122225/JNPPvmd5wxQn2heDsBSWWmllcr9YaP562effZb7yUZz8Kjtm1fUVkcNdkkMdBU1mg888EBej+dHP+uoNYyatX//+995iVriqGF74403cm1rTYtmzIsaxTyaDId//OMfSz3oWNSqRo3f4ooBr6LmuCSa27du3bp8r4oSx4/39eSTT662PZqbR3Z78MEHq22P2veohS6JWs6oKY0azqUV/bGritr5ZTleVVELX2oSvbSf48U95/jcRo3xd4la96q1vvHcqPGPbg9hxIgR+XyiNcG8NeQ1pTTIXOnfQU3ck6otMqJGPv4t77jjjnl9cY8BUF8I3QAstRj9OIJW9NmM5sQRaO6///48OvK8NtlkkwUO7hRNakM02Y5gF/2s4zhVl2h6G6KpdU376quvqgXceR1wwAG5yXuMiB3NwqOJeDR5X5IAHj82LMmAXfPeqwhlG2+8cfleFSWCXvQbnvd+RDP10v55mybPK5pKf/7550v1+vE5mjcUL8vx5hVNwJf1c7wg896HOOewOOf9Xc8t3fN4/6uKpvGlsjXxbyBUfd+X9Z5EUI8pyeLfTATweH7p/i/uMQDqC326AVgqMfhYDJQUNdjR5zf6sEYN
WfRrfeutt5b4eKUQGwNnRc32gswbPJbVBx98kAPAoo4bgeHJJ5/MfV4jdMRAYdHfNvqjR1/wuObvsiT9sBfXwvrERi3p4pxTTVjY68w76NqyHq+mLOh9qInP8bLch5q+h0sjBjSM8yiF4pq4J9FiZdSoUfn50Qc8atPj3/gee+xRK9PUAdQmoRuApfI///M/eWTiGEiragAs1UrPK5qHz+v//t//Wx49uzTtU0xjFc2Wvw+l+Z0XFvJLYn7l3XffPS8xt/fvf//7dM455+QgHuda04NCzXuvIoBFS4Cq84lHLecXX3wx33OjZrTqFFpLcm4xsnwMXhbNjKvWesaI8qX9K/rn+PtWuufx/letqY/m6zXRAiAGvXviiSdS165dy+/54t6ThX224ryiWXyMah6D+C3qvwEAKwLNywFYKqUauqo1cs8++2waPXr0AsvH1FJV+2THKMhRPkb1DlGbFlNd/fd//3f6+OOP53v+vNMoLavoP37xxRfnIHPIIYcsspnsvEojWs+YMSM/xijrYUEheGmURnoviRAU96R0r0L0pY5RoGN6p5L77rtvvqnFluTcYp7mqCm/5pprqm2PUcsjYFV9/RX1c/x9ix96YkqvGCm8qnnfo6URn+0YoTze8/gRaUnvSUxftqDP1oKeH2JkdoAVkZpuABbqxhtvXOC8y9FXM6bRipqwmC5o7733ThMnTkyDBw9OW2yxRbmPaFXRhDvmuo4phiKsxv+AR1/RM888s1xm0KBBuUyHDh3yVEVR2zZ58uT8P/vRFDymSloaMQBY1NbGYFBxvAjcMaVR1CLGHODRb3VhYsqtaF4e1xjlo1/5tddem9Zbb718rqUAHAOuxfVHbWEE3RgkbmF9iL9L9NeNY8fga3G+ca/i/lWd1iz6mEcYj+a60ZQ3mv1Gs+CqA5st6bnFlFQxB3QEsOg/3rFjx9yEPgaRO/XUU+c7dn2wpJ/j71v0iY5/bzF1WkznFu93/DuIz3Tz5s0XuyVDtCqJz0cE4RjgLY4xdOjQfI3ReiOOu6T3JJrrx7bobhHjM8TnNuZsjyXmpI9pzmbNmpXHNIjPURwHYEUkdAOwUPPWrpVEf89YJk2alGumYy7n+J/v+J/6+B/5kSNHLnBE7mimHQEygmuMXh61dTEqd0kc44UXXsjNUocMGZKb0EYN+DbbbFOtmeqSKj03BjOLYBChPs4jQu2iBlELEXQigMYPEDECcwSdmNc4zrFZs2blJvEx8FTMpxwjWUe4jzmhlzZ0x9zh48aNy31oo8Y7ajsj6JdqFktN4iOIRWCKQBwjS0dNd4w0XtWSnFu8P/EjRNyvCFJRLpr/X3rppfMdt75Y0s9xbYj5vuO9//Of/5yb/0dT8Aix8cPMon4wqip+ZIol3uMYYT7e/z59+uTR0+Oal/aexHzeMZL6aaedlltdRBP0CN233npr3h4/pEXQ79GjR/6hIAbqA1jRVMS8YbV9EgAALL5o0h39+n/3u99VaxoOwPJHn24AgOXYN998M9+2Uv/oGAcBgOWb5uUAAMuxaOof3S1ioLuYeuupp55Kt912W26yHXPIA7B8E7oBAJZjMVVcjGAeA5PFIGilwdWiaTkAyz99ugEAAKAg+nQDAABAQYRuAAAAKIg+3TVk7ty56aOPPsrzvVZUVNT26QAAAFCg6Kn95ZdfpjZt2qQGDRZeny1015AI3G3btq3t0wAAAOB79P7776f11ltvofuF7hoSNdylG960adPaPh0AAAAKFDNKRMVrKQsujNBdQ0pNyiNwC90AAAArhu/qXmwgNQAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAACoj6F7wIABafvtt09rrLFGatGiRerVq1d6/fXXq5X59ttvU9++fdM6
66yTVl999dS7d+80efLkamXee++9tPfee6dVV101H+eMM85Is2fPrlZm5MiRadttt02NGzdOG2+8cRoyZMh85zNo0KDUvn371KRJk9SlS5f03HPPFXTlAAAArAhqNXQ/8cQTOVA/88wzafjw4WnWrFmpR48eafr06eUyp512Wrr33nvT0KFDc/mPPvoo7bfffuX9c+bMyYF75syZadSoUenmm2/Ogfr8888vl5k4cWIu061btzR27Nh06qmnpmOOOSY9/PDD5TJ33HFHOv3009MFF1yQXnzxxdSxY8fUs2fPNGXKlO/xjgAAAFCfVFRWVlam5cQnn3ySa6ojXO+yyy5p6tSpad1110233npr2n///XOZCRMmpM033zyNHj067bjjjunBBx9MP/3pT3MYb9myZS4zePDgdNZZZ+XjNWrUKP99//33p/Hjx5df68ADD0xffPFFeuihh/J61GxHrfs111yT1+fOnZvatm2bTjrppHT22Wd/57lPmzYtNWvWLJ9z06ZNC7pDAAAALA8WNwMuV32642TD2muvnR/HjBmTa7+7d+9eLrPZZpul9ddfP4fuEI8dOnQoB+4QNdRxA1555ZVymarHKJUpHSNqyeO1qpZp0KBBXi+VmdeMGTPya1RdAAAAoKqGaTkRNcvR7PtHP/pR2mqrrfK2SZMm5ZrqNddcs1rZCNixr1SmauAu7S/tW1SZCMrffPNN+vzzz3Mz9QWViZr1hfVH79+//zJfN8uRioraPgNYcS0/ja4AAGrUclPTHX27o/n37bffnuqCfv365Zr50vL+++/X9ikBAACwnFkuarpPPPHEdN9996Unn3wyrbfeeuXtrVq1yk2/o+911druGL089pXKzDvKeGl086pl5h3xPNaj3f0qq6ySVlpppbwsqEzpGPOKUdBjAQCo6yr6a+0FtaXyAq296rtaremOMdwicN99993pscceSxtssEG1/Z07d04rr7xyGjFiRHlbTCkWU4R17do1r8fjyy+/XG2U8RgJPQL1FltsUS5T9RilMqVjRBP2eK2qZaK5e6yXygAAAECdqumOJuUxMvk//vGPPFd3qQ92jAAXNdDxePTRR+epvGJwtQjSMZp4BOEYuTzEFGMRrg877LA0cODAfIxzzz03H7tUE3388cfnUcnPPPPMdNRRR+WAf+edd+YRzUviNfr06ZO22267tMMOO6Qrr7wyT1125JFH1tLdAQAAoK6r1dB93XXX5cfddtut2vabbropHXHEEfnvK664Io8k3rt37zxieIw6fu2115bLRrPwaJp+wgkn5DC+2mqr5fB80UUXlctEDXoE7Jjz+6qrrspN2P/yl7/kY5UccMABeYqxmN87gnunTp3ydGLzDq4GAAAAdXKe7rrMPN31gNHLofb4KoJapU831B59uuuuOjlPNwAAANQnQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKA+hu4nn3wy7bPPPqlNmzapoqIiDRs2rNr+2Lag5dJLLy2Xad++/Xz7L7nkkmrHGTduXNp5551TkyZNUtu2bdPAgQPnO5ehQ4emzTbbLJfp0KFDeuCBBwq8cgAAAFYEtRq6p0+fnjp27JgGDRq0
wP0ff/xxteXGG2/Mobp3797Vyl100UXVyp100knlfdOmTUs9evRI7dq1S2PGjMmB/cILL0zXX399ucyoUaPSQQcdlI4++uj0r3/9K/Xq1Ssv48ePL/DqAQAAqO8a1uaL77nnnnlZmFatWlVb/8c//pG6deuWNtxww2rb11hjjfnKltxyyy1p5syZObA3atQobbnllmns2LHp8ssvT8cdd1wuc9VVV6U99tgjnXHGGXn94osvTsOHD0/XXHNNGjx4cA1cKQAAACuiOtOne/Lkyen+++/PtdHziubk66yzTtpmm21yTfbs2bPL+0aPHp122WWXHLhLevbsmV5//fX0+eefl8t079692jGjTGxfmBkzZuRa9KoLAAAALDc13Uvi5ptvzjXa++23X7XtJ598ctp2223T2muvnZuJ9+vXLzcxj5rsMGnSpLTBBhtUe07Lli3L+9Zaa638WNpWtUxsX5gBAwak/v371+AVAgAAUN/UmdAdzcMPOeSQPNBZVaeffnr576233jrXaP/yl7/Mobhx48aFnU+E+6qvHTXdMUgbAAAA1KnQ/c9//jM3B7/jjju+s2yXLl1y8/J33nknbbrpprmvdzRNr6q0XuoHvrAyC+snHiLQFxnqAQAAqPvqRJ/uG264IXXu3DmPdP5dYpC0Bg0apBYtWuT1rl275qnJZs2aVS4Tg6RFII+m5aUyI0aMqHacKBPbAQAAoE6G7q+++iqH5FjCxIkT89/vvfdetWbbMYf2McccM9/zY6CzK6+8Mr300kvp7bffziOVn3baaenQQw8tB+qDDz44NzmPAdheeeWVXFseo5VXbRp+yimnpIceeihddtllacKECXlKsRdeeCGdeOKJ38t9AAAAoH6q1eblEWxjCrCSUhDu06dPGjJkSP779ttvT5WVlXke7XlF8+7YHyE5RhOPAdMidFcN1M2aNUuPPPJI6tu3b64tb968eTr//PPL04WFnXbaKd16663p3HPPTb/97W/TJptskoYNG5a22mqrgu8AAAAA9VlFZSRallnUyEfAnzp1amratGltnw5Lo6Kits8AVly+iqBWVfT3HQi1pfIC34H1PQPWiT7dAAAAUBcJ3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgPoYup988sm0zz77pDZt2qSKioo0bNiwavuPOOKIvL3qsscee1Qr89lnn6VDDjkkNW3aNK255prp6KOPTl999VW1MuPGjUs777xzatKkSWrbtm0aOHDgfOcydOjQtNlmm+UyHTp0SA888EBBVw0AAMCKolZD9/Tp01PHjh3ToEGDFlomQvbHH39cXm677bZq+yNwv/LKK2n48OHpvvvuy0H+uOOOK++fNm1a6tGjR2rXrl0aM2ZMuvTSS9OFF16Yrr/++nKZUaNGpYMOOigH9n/961+pV69eeRk/fnxBVw4AAMCKoKKysrIyLQeiFvvuu+/OYbdqTfcXX3wxXw14yWuvvZa22GKL9Pzzz6ftttsub3vooYfSXnvtlT744INcg37dddelc845J02aNCk1atQolzn77LPzMSdMmJDXDzjggPwDQIT2kh133DF16tQpDR48eLHOP8J9s2bN0tSpU3Ot
O3VQRUVtnwGsuJaPryJYYVX09x0ItaXyAt+BddXiZsDlvk/3yJEjU4sWLdKmm26aTjjhhPTpp5+W940ePTo3KS8F7tC9e/fUoEGD9Oyzz5bL7LLLLuXAHXr27Jlef/319Pnnn5fLxPOqijKxfWFmzJiRb3LVBQAAAOpM6I6m5X/961/TiBEj0h/+8If0xBNPpD333DPNmTMn74/a6wjkVTVs2DCtvfbaeV+pTMuWLauVKa1/V5nS/gUZMGBA/lWjtERfcQAAAKiqYVqOHXjggeW/Y3CzrbfeOm200Ua59nv33Xev1XPr169fOv3008vrUdMteAMAAFBnarrnteGGG6bmzZunN998M6+3atUqTZkypVqZ2bNn5xHNY1+pzOTJk6uVKa1/V5nS/gVp3LhxbrdfdQEAAIA6G7pjcLTo0926deu83rVr1zzQWoxKXvLYY4+luXPnpi5dupTLxIjms2bNKpeJkc6jj/haa61VLhNN2KuKMrEdAAAA6mTojvm0x44dm5cwceLE/Pd7772X951xxhnpmWeeSe+8804OxT//+c/TxhtvnAc5C5tvvnnu933sscem5557Lj399NPpxBNPzM3SY+TycPDBB+dB1GI6sJha7I477khXXXVVtabhp5xySh71/LLLLssjmseUYi+88EI+FgAAANTJKcOib3a3bt3m296nT5881VdMHxbzZkdtdoTomG/74osvrjboWTQlj3B877335lHLe/funa6++uq0+uqrl8uMGzcu9e3bN08tFs3TTzrppHTWWWdVe82hQ4emc889Nwf8TTbZJA0cODBPPba4TBlWD5gyDGqPKcOgVpkyDGqPKcPqrsXNgMvNPN11ndBdDwjdUHt8FUGtErqh9gjddVe9macbAAAA6iqhGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUB9D95NPPpn22Wef1KZNm1RRUZGGDRtW3jdr1qx01llnpQ4dOqTVVlstlzn88MPTRx99VO0Y7du3z8+tulxyySXVyowbNy7tvPPOqUmTJqlt27Zp4MCB853L0KFD02abbZbLxGs+8MADBV45AAAAK4JaDd3Tp09PHTt2TIMGDZpv39dff51efPHFdN555+XHu+66K73++uvpZz/72XxlL7roovTxxx+Xl5NOOqm8b9q0aalHjx6pXbt2acyYMenSSy9NF154Ybr++uvLZUaNGpUOOuigdPTRR6d//etfqVevXnkZP358gVcPAABAfVdRWVlZmZYDUUN9991357C7MM8//3zaYYcd0rvvvpvWX3/9ck33qaeempcFue6669I555yTJk2alBo1apS3nX322blWfcKECXn9gAMOyD8A3HfffeXn7bjjjqlTp05p8ODBi3X+Ee6bNWuWpk6dmpo2bbpE185yoqKits8AVlzLx1cRrLAq+vsOhNpSeYHvwLpqcTNgnerTHRcT4XzNNdestj2ak6+zzjppm222yTXZs2fPLu8bPXp02mWXXcqBO/Ts2TPXmn/++eflMt27d692zCgT2xdmxowZ+SZXXQAAAKCqhqmO+Pbbb3Mf72gGXvVXhJNPPjltu+22ae21187NxPv165ebmF9++eV5f9Rw
b7DBBtWO1bJly/K+tdZaKz+WtlUtE9sXZsCAAal///41fJUAAADUJ3UidMegar/4xS9StISP5uJVnX766eW/t95661yj/ctf/jKH4saNGxd2ThHuq7521HTHIG0AAABQZ0J3KXBHP+7HHnvsO/tLd+nSJTcvf+edd9Kmm26aWrVqlSZPnlytTGk99pUeF1SmtH9BItAXGeoBAACo+xrUhcD9xhtvpEcffTT32/4uY8eOTQ0aNEgtWrTI6127ds1Tk8WxSoYPH54DeTQtL5UZMWJEteNEmdgOAAAAdbKm+6uvvkpvvvlmeX3ixIk5NEf/7NatW6f9998/TxcWo4rPmTOn3Mc69kcz8hjo7Nlnn03dunVLa6yxRl4/7bTT0qGHHloO1AcffHDuex3TgUWf8JgG7KqrrkpXXHFF+XVPOeWUtOuuu6bLLrss7b333un2229PL7zwQrVpxQAAAKBOTRk2cuTIHJjn1adPnzyX9rwDoJU8/vjjabfddsuB/Fe/+lWe+itGE4/yhx12WO5rXbXp97hx41Lfvn3zlGPNmzfP83hHAK9q6NCh6dxzz83N0jfZZJM0cODAtNdeey32tZgyrB4wZRjUHlOGQa0yZRjUHlOG1V2LmwGXm3m66zqhux4QuqH2+CqCWiV0Q+0RuuuuejlPNwAAANQlQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAWJ5C94Ybbpg+/fTT+bZ/8cUXeR8AAACwlKH7nXfeSXPmzJlv+4wZM9KHH35YE+cFAAAAdV7DJSl8zz33lP9++OGHU7NmzcrrEcJHjBiR2rdvX7NnCAAAACtC6O7Vq1d+rKioSH369Km2b+WVV86B+7LLLqvZMwQAAIAVIXTPnTs3P26wwQbp+eefT82bNy/qvAAAAGDFCt0lEydOrPkzAQAAgHpmqUJ3iP7bsUyZMqVcA15y44031sS5AQAAwIoXuvv3758uuuiitN1226XWrVvnPt4AAABADYTuwYMHpyFDhqTDDjtsaZ4OAAAAK4Slmqd75syZaaeddqr5swEAAIAVPXQfc8wx6dZbb635swEAAIAVvXn5t99+m66//vr06KOPpq233jrP0V3V5ZdfXlPnBwAAACtW6B43blzq1KlT/nv8+PHV9hlUDQAAAJYhdD/++ONL8zQAAABYoSxVn24AAACgoJrubt26LbIZ+WOPPbY0hwUAAIB6ZalCd6k/d8msWbPS2LFjc//uPn361NS5AQAAwIoXuq+44ooFbr/wwgvTV199taznBAAAAPVCjfbpPvTQQ9ONN95Yk4cEAACAOqtGQ/fo0aNTkyZNavKQAAAAsGI1L99vv/2qrVdWVqaPP/44vfDCC+m8886rqXMDAACAFS90N2vWrNp6gwYN0qabbpouuuii1KNHj5o6NwAAAFjxQvdNN91U82cCAAAA9cxShe6SMWPGpNdeey3/veWWW6Ztttmmps4LAAAAVszQPWXKlHTggQemkSNHpjXXXDNv++KLL1K3bt3S7bffntZdd92aPk8AAABYMUYvP+mkk9KXX36ZXnnllfTZZ5/lZfz48WnatGnp5JNPrvmzBAAAgBUldD/00EPp2muvTZtvvnl52xZbbJEGDRqUHnzwwcU+zpNPPpn22Wef1KZNm1RRUZGGDRs236jo559/fmrdunVaZZVVUvfu3dMbb7xRrUwE/kMOOSQ1bdo017offfTR6auvvqpWZty4cWnnnXfO05m1bds2DRw4cL5zGTp0aNpss81ymQ4dOqQHHnhgCe4IAAAA1FDonjt3blp55ZXn2x7bYt/imj59eurYsWMO6wsS4fjqq69OgwcPTs8+
+2xabbXVUs+ePdO3335bLhOBO2rchw8fnu67774c5I877rjy/qh9jxHV27Vrl/ugX3rppenCCy9M119/fbnMqFGj0kEHHZQD+7/+9a/Uq1evvETtPQAAACytisqoTl5CP//5z3Mf7ttuuy3XUocPP/wwB+C11lor3X333Ut+IhUV+XkRdkOcVhz717/+dfrNb36Tt02dOjW1bNkyDRkyJPcpj0Hcoob9+eefT9ttt125Fn6vvfZKH3zwQX7+ddddl84555w0adKk1KhRo1zm7LPPzrXqEyZMyOsHHHBA/gEgQnvJjjvumDp16pQD/+KIcB9TqcU5Rq07dVBFRW2fAay4lvyrCKhBFf19B0JtqbzAd2BdtbgZcKlquq+55pr8Au3bt08bbbRRXjbYYIO87U9/+lOqCRMnTsxBOZqUl8QFdenSJY0ePTqvx2M0KS8F7hDlY97wqBkvldlll13KgTtEbfnrr7+ePv/883KZqq9TKlN6HQAAAPjeRi+PftEvvvhievTRR8u1xdG/e97guiwicIeo2a4q1kv74rFFixbV9jds2DCtvfba1crEDwLzHqO0L2rm43FRr7MgM2bMyEtJ/OAAAAAAS13T/dhjj+Xm3BEwozn4T37ykzySeSzbb799nqv7n//8Z1oRDBgwINe8l5b4IQIAAACWOnRfeeWV6dhjj11ge/UInr/85S/T5ZdfnmpCq1at8uPkyZOrbY/10r54jDnDq5o9e3Ye0bxqmQUdo+prLKxMaf+C9OvXL7fdLy3vv//+MlwtAAAAaUUP3S+99FLaY489Fro/RgmPEcJrQjQJj9A7YsSI8raoYY++2l27ds3r8RgDulV9zaiNjxHUo+93qUyMaD5r1qxymRjpfNNNN81Ny0tlqr5OqUzpdRakcePG+ceHqgsAAAAsdeiO2t8FTRVWtT/1J598stjHi/m0x44dm5fS4Gnx93vvvZebr5966qnpd7/7XbrnnnvSyy+/nA4//PA8InlphPPoRx4/AkTt+3PPPZeefvrpdOKJJ+aRzUujqh988MF5ELWYDiymFrvjjjvSVVddlU4//fTyeZxyyil51PPLLrss91GPKcVeeOGFfCwAAAD4XgZS+8EPfpDnrt54440XuH/cuHGpdevWi328CLbdunUrr5eCcJ8+ffK0YGeeeWaeyivm3Y4a7R//+Mc5HDdp0qT8nFtuuSWH49133z2PWt67d+88t3fVZu+PPPJI6tu3b+rcuXNq3rx5Ov/886vN5b3TTjulW2+9NZ177rnpt7/9bdpkk03ylGJbbbXVktweAAAAWPp5umPAtJEjR+Z5sasG3/DNN9+kHXbYIYfoqqF3RWGe7nrAPN1Qe8zTDbXKPN1Qe8zTXf8z4BKF7mhevu2226aVVlop1y5Hv+gQTbIHDRqU5syZk6cSm3f6rRWB0F0PCN1Qe4RuqFVCN9Qeobv+Z8Alal4eYXrUqFHphBNOyKN3l/J69L/u2bNnDt4rYuAGAACAZQ7doV27dumBBx5In3/+eXrzzTdz8I4+0KWRwAEAAIClDN0lEbK33377pX06AAAA1HtLNGUYAAAAsPiEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAAKyoobt9+/apoqJivqVv3755/2677Tbf
vuOPP77aMd5777209957p1VXXTW1aNEinXHGGWn27NnVyowcOTJtu+22qXHjxmnjjTdOQ4YM+V6vEwAAgPqnYVrOPf/882nOnDnl9fHjx6ef/OQn6T//8z/L24499th00UUXldcjXJfEcyNwt2rVKo0aNSp9/PHH6fDDD08rr7xy+v3vf5/LTJw4MZeJsH7LLbekESNGpGOOOSa1bt069ezZ83u7VgAAAOqX5T50r7vuutXWL7nkkrTRRhulXXfdtVrIjlC9II888kh69dVX06OPPppatmyZOnXqlC6++OJ01llnpQsvvDA1atQoDR48OG2wwQbpsssuy8/ZfPPN01NPPZWuuOIKoRsAAID627y8qpkzZ6a///3v6aijjsrNyEuidrp58+Zpq622Sv369Utff/11ed/o0aNThw4dcuAuiSA9bdq09Morr5TLdO/evdprRZnYDgAAAPW2pruqYcOGpS+++CIdccQR5W0HH3xwateuXWrTpk0aN25crsF+/fXX01133ZX3T5o0qVrgDqX12LeoMhHMv/nmm7TKKqvMdy4zZszIS0mUBQAAgDobum+44Ya055575oBdctxxx5X/jhrt6Ie9++67p7feeis3Qy/KgAEDUv/+/Qs7PgAAAHVfnWle/u677+Z+2THA2aJ06dIlP7755pv5Mfp6T548uVqZ0nqpH/jCyjRt2nSBtdwhmrFPnTq1vLz//vvLcHUAAADUR3UmdN900015uq8YZXxRxo4dmx+jxjt07do1vfzyy2nKlCnlMsOHD8+BeosttiiXiRHLq4oysX1hYmqxOEbVBQAAAOpc6J47d24O3X369EkNG/7/FvHRhDxGIh8zZkx655130j333JOnA9tll13S1ltvncv06NEjh+vDDjssvfTSS+nhhx9O5557bp7nO4JziKnC3n777XTmmWemCRMmpGuvvTbdeeed6bTTTqu1awYAAKDuqxOhO5qVv/fee3nU8qpiuq/YF8F6s802S7/+9a9T796907333lsus9JKK6X77rsvP0bN9aGHHpqDedV5vWO6sPvvvz/Xbnfs2DFPHfaXv/zFdGEAAAAsk4rKysrKZTsEpdHLmzVrlvt3a2peR1WZhg74nvkqglpV0d93INSWygt8B9b3DFgnaroBAACgLhK6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAAK2LovvDCC1NFRUW1ZbPNNivv//bbb1Pfvn3TOuusk1ZfffXUu3fvNHny5GrHeO+999Lee++dVl111dSiRYt0xhlnpNmzZ1crM3LkyLTtttumxo0bp4033jgNGTLke7tGAAAA6q/lOnSHLbfcMn388cfl5amnnirvO+2009K9996bhg4dmp544on00Ucfpf3226+8f86cOTlwz5w5M40aNSrdfPPNOVCff/755TITJ07MZbp165bGjh2bTj311HTMMcekhx9++Hu/VgAAAOqXhmk517Bhw9SqVav5tk+dOjXdcMMN6dZbb03/8R//kbfddNNNafPNN0/PPPNM2nHHHdMjjzySXn311fToo4+mli1bpk6dOqWLL744nXXWWbkWvVGjRmnw4MFpgw02SJdddlk+Rjw/gv0V
V1yRevbs+b1fLwAAAPXHcl/T/cYbb6Q2bdqkDTfcMB1yyCG5uXgYM2ZMmjVrVurevXu5bDQ9X3/99dPo0aPzejx26NAhB+6SCNLTpk1Lr7zySrlM1WOUypSOAQAAAPWyprtLly65Ofimm26am5b3798/7bzzzmn8+PFp0qRJuaZ6zTXXrPacCNixL8Rj1cBd2l/at6gyEcy/+eabtMoqqyzw3GbMmJGXkigPAAAAdSZ077nnnuW/t9566xzC27Vrl+68886FhuHvy4ABA/KPAAAAAFBnm5dXFbXaP/zhD9Obb76Z+3nHAGlffPFFtTIxenmpD3g8zjuaeWn9u8o0bdp0kcG+X79+uV95aXn//fdr7DoBAACoH+pU6P7qq6/SW2+9lVq3bp06d+6cVl555TRixIjy/tdffz33+e7atWtej8eXX345TZkypVxm+PDhOVBvscUW5TJVj1EqUzrGwsT0YnGcqgsAAADUmdD9m9/8Jk8F9s477+Qpv/bdd9+00korpYMOOig1a9YsHX300en0009Pjz/+eB5Y7cgjj8xhOUYuDz169Mjh+rDDDksvvfRSngbs3HPPzXN7R2gOxx9/fHr77bfTmWeemSZMmJCuvfba3Hw9piMDAACAetun+4MPPsgB+9NPP03rrrtu+vGPf5ynA4u/Q0zr1aBBg9S7d+88qFmMOh6huSQC+n333ZdOOOGEHMZXW2211KdPn3TRRReVy8R0Yffff38O2VdddVVab7310l/+8hfThQEAALDMKiorKyuX/TDE6OVR+x79uzU1r6MqKmr7DGDF5asIalVFf9+BUFsqL/AdWN8z4HLdvBwAAADqMqEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAACwIobuAQMGpO233z6tscYaqUWLFqlXr17p9ddfr1Zmt912SxUVFdWW448/vlqZ9957L+29995p1VVXzcc544wz0uzZs6uVGTlyZNp2221T48aN08Ybb5yGDBnyvVwjAAAA9ddyHbqfeOKJ1Ldv3/TMM8+k4cOHp1mzZqUePXqk6dOnVyt37LHHpo8//ri8DBw4sLxvzpw5OXDPnDkzjRo1Kt188805UJ9//vnlMhMnTsxlunXrlsaOHZtOPfXUdMwxx6SHH374e71eAAAA6peKysrKylRHfPLJJ7mmOsL4LrvsUq7p7tSpU7ryyisX+JwHH3ww/fSnP00fffRRatmyZd42ePDgdNZZZ+XjNWrUKP99//33p/Hjx5efd+CBB6YvvvgiPfTQQ4t1btOmTUvNmjVLU6dOTU2bNq2R6+V7VlFR22cAK66681UE9VJFf9+BUFsqL/AdWFctbgZcrmu65xUXE9Zee+1q22+55ZbUvHnztNVWW6V+/fqlr7/+urxv9OjRqUOHDuXAHXr27Jlv0CuvvFIu071792rHjDKxHQAAAJZWw1RHzJ07Nzf7/tGPfpTDdcnBBx+c2rVrl9q0aZPGjRuXa62j3/ddd92V90+aNKla4A6l9di3qDIRzL/55pu0yiqrzHc+M2bMyEtJlAUAAIA6Gbqjb3c0/37qqaeqbT/uuOPKf0eNduvWrdPuu++e3nrrrbTRRhsVOshb//79Czs+
AAAAdV+daF5+4oknpvvuuy89/vjjab311ltk2S5duuTHN998Mz+2atUqTZ48uVqZ0nrsW1SZaJe/oFruEM3Yo7l7aXn//feX4QoBAACoj5br0B1jvEXgvvvuu9Njjz2WNthgg+98Tow+HqLGO3Tt2jW9/PLLacqUKeUyMRJ6BOotttiiXGbEiBHVjhNlYvvCxNRicYyqCwAAANSZ0B1Nyv/+97+nW2+9Nc/VHX2vY4l+1iGakF988cVpzJgx6Z133kn33HNPOvzww/PI5ltvvXUuE1OMRbg+7LDD0ksvvZSnATv33HPzsSM4h5jX++23305nnnlmmjBhQrr22mvTnXfemU477bRavX4AAADqtuV6yrCKhUzhdNNNN6UjjjgiN+k+9NBDc1/vmLu7bdu2ad99982humrN87vvvptOOOGENHLkyLTaaqulPn36pEsuuSQ1bPj/u7THvgjZr776am7Cft555+XXWFymDKsHTBkGtWf5/SqCFYIpw6D2mDKs7lrcDLhch+66ROiuB4RuqD2+iqBWCd1Qe4TuuqteztMNAAAAdYnQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAABRE6AYAAICCCN0AAABQEKEbAAAACiJ0AwAAQEGEbgAAACiI0A0AAAAFEboBAACgIEI3AAAAFEToBgAAgIII3QAAAFAQoRsAAAAKInQDAABAQYRuAAAAKIjQPY9Bgwal9u3bpyZNmqQuXbqk5557rrZPCQAAgDpK6K7ijjvuSKeffnq64IIL0osvvpg6duyYevbsmaZMmVLbpwYAAEAdJHRXcfnll6djjz02HXnkkWmLLbZIgwcPTquuumq68cYba/vUAAAAqIOE7v81c+bMNGbMmNS9e/fytgYNGuT10aNH1+q5AQAAUDc1rO0TWF78+9//TnPmzEktW7astj3WJ0yYMF/5GTNm5KVk6tSp+XHatGnfw9kC1DP+2wm169vaPgFYcckPdf+9q6ysXGQ5oXspDRgwIPXv33++7W3btq2V8wGo05o1q+0zAIBa0ewS34F13ZdffpmaLeL/ZYTu/9W8efO00korpcmTJ1fbHuutWrWar3y/fv3yoGslc+fOTZ999llaZ511UkVFxfdyzsD//5UxfvB6//33U9OmTWv7dADge+M7EGpP1HBH4G7Tps0iywnd/6tRo0apc+fOacSIEalXr17lIB3rJ5544nzlGzdunJeq1lxzze/tfIH5xf9s+B8OAFZEvgOhdiyqhrtE6K4iaq779OmTtttuu7TDDjukK6+8Mk2fPj2PZg4AAABLSuiu4oADDkiffPJJOv/889OkSZNSp06d0kMPPTTf4GoAAACwOITueURT8gU1JweWX9HV44ILLpivywcA1He+A2H5V1H5XeObAwAAAEulwdI9DQAAAPguQjcAAAAUROgGAACAggjdQJ315JNPpn322Se1adMmVVRUpGHDhtX2KQHA92bQoEGpffv2qUmTJqlLly7pueeeq+1TAhZA6AbqrOnTp6eOHTvm/+kAgBXJHXfckU4//fQ8cvmLL76Yvw979uyZpkyZUtunBszD6OVAvRA13XfffXfq1atXbZ8KABQuara33377dM011+T1uXPnprZt26aTTjopnX322bV9ekAVaroBAKAOmTlzZhozZkzq3r17eVuDBg3y+ujRo2v13ID5Cd0AAFCH/Pvf/05z5sxJLVu2rLY91idNmlRr
5wUsmNANAAAABRG6AQCgDmnevHlaaaWV0uTJk6ttj/VWrVrV2nkBCyZ0AwBAHdKoUaPUuXPnNGLEiPK2GEgt1rt27Vqr5wbMr+ECtgHUCV999VV68803y+sTJ05MY8eOTWuvvXZaf/31a/XcAKBIMV1Ynz590nbbbZd22GGHdOWVV+apNI888sjaPjVgHqYMA+qskSNHpm7dus23Pf4nZMiQIbVyTgDwfYnpwi699NI8eFqnTp3S1VdfnacSA5YvQjcAAAAURJ9uAAAAKIjQDQAAAAURugEAAKAgQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AWEZDhgxJa6655jIfp6KiIg0bNmyRZT799NPUokWL9M4776Tl0YUXXpg6depU26ex3Kr6Hv/73//O7+UHH3xQ26cFQIGEbgBWeEcccUTq1atXqgv+67/+K/385z9P7du3L29777330t57751WXXXVHOLOOOOMNHv27KV+jd122y2Hw3mXZTlmTXrllVdS79698z2I87ryyiuX+ZhVr7Np06Zp++23T//4xz9SkZo3b54OP/zwdMEFFxT6OgDULqEbAOqIr7/+Ot1www3p6KOPLm+bM2dODtwzZ85Mo0aNSjfffHOueT///POX6bWOPfbY9PHHH1dbGjZsmJaX+7DhhhumSy65JLVq1arGjnvTTTfl63zhhRfSj370o7T//vunl19+ORXpyCOPTLfcckv67LPPCn0dAGqP0A0A3+Hyyy9PHTp0SKuttlpq27Zt+tWvfpW++uqr+cpFs+FNNtkkNWnSJPXs2TO9//771fZHzem2226b90do7N+//xLVHj/wwAOpcePGaccddyxve+SRR9Krr76a/v73v+dm3XvuuWe6+OKL06BBg3IQX1pRax6BtuoSzjrrrPTDH/4w749rOO+889KsWbMWepy33norlzvxxBNTZWVlmjFjRvrNb36TfvCDH+T72aVLlzRy5MglOreohb700kvTgQcemO9HTYkuAnGdcX1xD+O9efzxx8v74/38xS9+kcutvfbaucVB1Wb+zz//fPrJT36Sa7CbNWuWdt111/Tiiy8u8jW33HLL1KZNm3T33XfX2HUAsHwRugHgOzRo0CBdffXVuVlz1CQ/9thj6cwzz5yv9jWafv/1r39NTz/9dPriiy9yKCz55z//mZsSn3LKKTkk//d//3eukY7nLK44RufOnattGz16dP5BoGXLluVtEfinTZuWz7f0vNVXX32RS9S2Lo411lgjn3dcw1VXXZX+/Oc/pyuuuGKBZceNG5d+/OMfp4MPPjhdc801uel2hO8459tvvz3v/8///M+0xx57pDfeeCPVpN///vffec3RLH9BImxHi4LQqFGj/Bg/LMR9jeuP+xnvcRwjzr3048aXX36Z+vTpk5566qn0zDPP5B9g9tprr7x9UXbYYYd8TADqqUoAWMH16dOn8uc///lilx86dGjlOuusU16/6aabKuMr9Zlnnilve+211/K2Z599Nq/vvvvulb///e+rHedvf/tbZevWrcvrUf7uu+9e6OvGOR511FHVth177LGVPXr0qLZt+vTp+VgPPPBAXv/6668r33jjjUUu06ZNKz9/1113rVx55ZUrV1tttfJy+umnL/CcLr300srOnTuX1y+44ILKjh07Vj799NOVa621VuUf//jH8r533323cqWVVqr88MMPqx0j7k2/fv0ql0a7du0qr7jiivm2f/rpp995zbNmzSqXj/vVpEmTfK0NGjTI6+3bt8/HKb1Xm266aeXcuXPLz5kxY0blKqusUvnwww8v8NzmzJlTucYaa1Tee++9i3yPTzvttMrddtttqa4fgOXf8tE5CwCWY48++mgaMGBAmjBhQq5BjprQb7/9NtduRzPrEP2do9lzyWabbZabIb/22mu5JvOll17KtaNVa7ajP/a8x1mUb775JjdNX1KrrLJK2njjjZfoOYccckg655xz
yuul0dnvuOOOXOsfzcajiX3cixh4rKqoQY5m1nGtp556anl79I+Oa47m21VFk/N11lkn1aRo/h3Lkoga++7du6e33347nXbaafk6S8eI9+/NN9/MNd1VxfsX9yJMnjw5nXvuubm5/JQpU/K1xnu7sBr1qu9PlAOgfhK6AWARos/uT3/603TCCSfkEBkhLJoPx2Bm0ax4ccJyiIAafbj322+/+fYtbpCOvsKff/55tW3RB/m5556rti3CX2lfiKbL0dd7UaK5ewTtkuiTPG9Qj2bhUSauI5paR5loJn7ZZZdVK7fuuuvmfsq33XZbOuqoo8qhPO7BSiutlMaMGZMfq4qm2jXdvDyWRYkm8uuvv355Pe5XXHMsMahaNA2PMjEifJx7NO1fUDP8uN4QTctjSrdodt+uXbvc37xr167f2bc+BlErHQOA+kfoBoBFiIA4d+7cHCyjb3e488475ysXNb4x6nXUaofXX3899+vefPPN83oMoBbblrTGuaptttkmD5hWVYS6+DEgalYjHIbhw4fnoLvFFlvk9e222y6NHTt2kceu2id8YWJ09AiTVWvA33333QXW3N533305tEY4j8HeooY4zj9qf+Ncd95551Sk448/Pg96tijxw8DCxPsYITvubYToeP+ilj/u8bw1+yXRkuHaa6/N110aeC3m4v4u48ePz9O0AVA/Cd0AkFKaOnXqfME0mjxHSI5BtP70pz+lffbZJwerwYMHz/f8lVdeOZ100km5SXI0NY8Bw2KU8VIIjym8osY8alZjKqoI8NFkOQLX7373u8U6xwiw/fr1y7Xda621Vt7Wo0ePHK4PO+ywNHDgwDRp0qTcxLlv377lkb2Xpnn5gsTAYNFUOmq3oyn9/fffv9BRt2Nk8tgfNeyxPPTQQ7lZedSUx4By8SNGhPBPPvkkjRgxIm299dZ56rPFETXHUQNd+vvDDz/M713Ulpeuc2mal88rmsbvu+++edC8OO8YMT1GLL/ooovSeuutl39wuOuuu/L+WI/787e//S3/yBHdEGK+9Lj3ixLNyuOHne+qlQeg7jJ6OQCklPvhRgisukQz6o4dO+Ypw/7whz+krbbaKjcvjv7d84pm5jGdVozUHXM8RwCMmtGqgTlqf6PWNwJrBPLoQxw1x4srRimPGteqNe3RTDuOG49R633ooYfmUBvBsKb97Gc/y32d4weFmJ4sar5jyrCFiXvw4IMP5qnCIlBPnz49N9uO8/v1r3+dNt1009SrV6881VbVZt4xynmMkL4wH330Ufk9inm1//jHP+a/jznmmBq93hiZfIMNNsi13fH+Pvnkk/k8o4tAtGCILgbRp7tU8x0jnscPIvEexY8gJ598crn1wcLENHJxzKJr/gGoPRUxmlotvj4AsASi9jhqUKOGvNTcvT6ZOHFirhGPmuyoOa7v4seXCOfxYw0A9ZPm5QBQh0SNccxpHU2q27Ztm+qbBx54IB133HErROCO/t5Ra37QQQfV9qkAUCA13QAAAFCQ+tcuDQAAAJYTQjcAAAAUROgGAACAggjdAAAAUBChGwAAAAoidAMAAEBBhG4AAAAoiNANAAAABRG6AQAAoCBCNwAAAKRi/D9LDOiyvH123QAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analyze label distribution\n", + "print(\"=== LABEL DISTRIBUTION ===\")\n", + "print(\"Training data:\")\n", + "print(train_data['label'].value_counts())\n", + "print(f\"\\nFake news percentage: {(train_data['label'] == 0).mean() * 100:.2f}%\")\n", + "print(f\"Real news percentage: {(train_data['label'] == 1).mean() * 100:.2f}%\")\n", + "\n", + "# Visualize label distribution\n", + "plt.figure(figsize=(10, 6))\n", + "train_data['label'].value_counts().plot(kind='bar', color=['red', 'green'])\n", + "plt.title('Label Distribution in Training Data')\n", + "plt.xlabel('Label (0=Fake, 1=Real)')\n", + "plt.ylabel('Count')\n", + "plt.xticks(rotation=0)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== SUBJECT DISTRIBUTION ===\n", + "subject\n", + "politicsNews 11272\n", + "News 9050\n", + "worldnews 8727\n", + "politics 6841\n", + "left-news 2482\n", + "Government News 1570\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analyze subject distribution\n", + "print(\"=== SUBJECT DISTRIBUTION ===\")\n", + "print(train_data['subject'].value_counts())\n", + "\n", + "# Visualize subject distribution\n", + "plt.figure(figsize=(12, 8))\n", + "train_data['subject'].value_counts().plot(kind='bar')\n", + "plt.title('Subject Distribution in Training Data')\n", + "plt.xlabel('Subject')\n", + "plt.ylabel('Count')\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== DATE ANALYSIS ===\n", + "Date parsing results:\n", + "Training data - Successfully parsed: 39,934 (100.0%)\n", + "Training data - Parsing errors: 8 (0.0%)\n", + "Validation data - Successfully parsed: 4,954 (100.0%)\n", + "Validation data - Parsing errors: 2 (0.0%)\n", + "\n", + "Sample problematic dates from training data:\n", + "29357 https://100percentfedup.com/served-roy-moore-v...\n", + "35506 https://100percentfedup.com/video-hillary-aske...\n", + "35507 https://100percentfedup.com/12-yr-old-black-co...\n", + "35838 https://fedup.wpengine.com/wp-content/uploads/...\n", + "35839 https://fedup.wpengine.com/wp-content/uploads/...\n", + "37431 https://fedup.wpengine.com/wp-content/uploads/...\n", + "37432 https://fedup.wpengine.com/wp-content/uploads/...\n", + "38932 MSNBC HOST Rudely Assumes Steel Worker Would N...\n", + "Name: date_clean, dtype: object\n", + "\n", + "Date range in training data: 2015-03-31 00:00:00 to 2018-02-19 00:00:00\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analyze date distribution with comprehensive parsing\n", + "print(\"=== DATE ANALYSIS ===\")\n", + "\n", + "# Clean date strings by removing extra spaces\n", + "train_data['date_clean'] = train_data['date'].str.strip()\n", + "validation_data['date_clean'] = validation_data['date'].str.strip()\n", + "\n", + "# Function to parse dates with multiple formats\n", + "def parse_dates_robust(date_series):\n", + " \"\"\"Parse dates with multiple format support\"\"\"\n", + " # Try different date formats\n", + " formats = [\n", + " '%B %d, %Y', # December 20, 2017\n", + " '%d-%b-%y', # 19-Feb-18\n", + " '%Y-%m-%d', # 2017-12-20\n", + " '%m/%d/%Y', # 12/20/2017\n", + " '%d/%m/%Y', # 20/12/2017\n", + " ]\n", + " \n", + " parsed_dates = pd.Series([pd.NaT] * len(date_series), index=date_series.index)\n", + " \n", + " for fmt in formats:\n", + " try:\n", + " # Try to parse with current format\n", + " temp_parsed = pd.to_datetime(date_series, format=fmt, errors='coerce')\n", + " # Update only the ones that weren't parsed yet\n", + " mask = parsed_dates.isna() & temp_parsed.notna()\n", + " parsed_dates[mask] = temp_parsed[mask]\n", + " except:\n", + " continue\n", + " \n", + " # Try pandas automatic parsing for any remaining\n", + " remaining_mask = parsed_dates.isna()\n", + " if remaining_mask.any():\n", + " try:\n", + " auto_parsed = pd.to_datetime(date_series[remaining_mask], errors='coerce')\n", + " parsed_dates[remaining_mask] = auto_parsed\n", + " except:\n", + " pass\n", + " \n", + " return parsed_dates\n", + "\n", + "# Parse dates\n", + "train_data['date_parsed'] = parse_dates_robust(train_data['date_clean'])\n", + "validation_data['date_parsed'] = parse_dates_robust(validation_data['date_clean'])\n", + "\n", + "# Check parsing results\n", + "train_parse_errors = train_data['date_parsed'].isna().sum()\n", + "val_parse_errors = validation_data['date_parsed'].isna().sum()\n", + "\n", + 
"print(f\"Date parsing results:\")\n", + "print(f\"Training data - Successfully parsed: {len(train_data) - train_parse_errors:,} ({((len(train_data) - train_parse_errors) / len(train_data) * 100):.1f}%)\")\n", + "print(f\"Training data - Parsing errors: {train_parse_errors:,} ({train_parse_errors / len(train_data) * 100:.1f}%)\")\n", + "print(f\"Validation data - Successfully parsed: {len(validation_data) - val_parse_errors:,} ({((len(validation_data) - val_parse_errors) / len(validation_data) * 100):.1f}%)\")\n", + "print(f\"Validation data - Parsing errors: {val_parse_errors:,} ({val_parse_errors / len(validation_data) * 100:.1f}%)\")\n", + "\n", + "if train_parse_errors > 0:\n", + " print(\"\\nSample problematic dates from training data:\")\n", + " print(train_data[train_data['date_parsed'].isna()]['date_clean'].head(10))\n", + "\n", + "# Show date range for successfully parsed dates\n", + "if train_parse_errors < len(train_data):\n", + " valid_dates = train_data['date_parsed'].dropna()\n", + " print(f\"\\nDate range in training data: {valid_dates.min()} to {valid_dates.max()}\")\n", + " \n", + " # Visualize date distribution by year\n", + " plt.figure(figsize=(12, 6))\n", + " valid_dates.dt.year.value_counts().sort_index().plot(kind='bar')\n", + " plt.title('Article Distribution by Year (Successfully Parsed Dates)')\n", + " plt.xlabel('Year')\n", + " plt.ylabel('Number of Articles')\n", + " plt.xticks(rotation=45)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "else:\n", + " print(\"\\nNo dates could be parsed successfully.\")\n", + " print(\"\\nSample date values:\")\n", + " print(train_data['date_clean'].head(10))\n", + " print(\"\\nUnique date formats:\")\n", + " print(train_data['date_clean'].value_counts().head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== TEXT AND TITLE LENGTH ANALYSIS ===\n", + "Text length statistics:\n", + 
"count 39942.000000\n", + "mean 2384.637875\n", + "std 1765.895463\n", + "min 1.000000\n", + "25% 1259.000000\n", + "50% 2197.000000\n", + "75% 3069.000000\n", + "max 49705.000000\n", + "Name: text_length, dtype: float64\n", + "\n", + "Title length statistics:\n", + "count 39942.000000\n", + "mean 79.774723\n", + "std 24.811136\n", + "min 8.000000\n", + "25% 63.000000\n", + "50% 73.000000\n", + "75% 90.000000\n", + "max 286.000000\n", + "Name: title_length, dtype: float64\n", + "\n", + "Word count statistics:\n", + "count 39942.000000\n", + "mean 391.735867\n", + "std 286.378195\n", + "min 0.000000\n", + "25% 207.000000\n", + "50% 363.500000\n", + "75% 506.000000\n", + "max 7033.000000\n", + "Name: word_count, dtype: float64\n", + "\n", + "Title word count statistics:\n", + "count 39942.000000\n", + "mean 12.392870\n", + "std 4.038761\n", + "min 1.000000\n", + "25% 10.000000\n", + "50% 11.000000\n", + "75% 14.000000\n", + "max 42.000000\n", + "Name: title_word_count, dtype: float64\n" + ] + } + ], + "source": [ + "# Analyze text and title lengths\n", + "print(\"=== TEXT AND TITLE LENGTH ANALYSIS ===\")\n", + "\n", + "# Calculate lengths\n", + "train_data['text_length'] = train_data['text'].str.len()\n", + "train_data['title_length'] = train_data['title'].str.len()\n", + "train_data['word_count'] = train_data['text'].str.split().str.len()\n", + "train_data['title_word_count'] = train_data['title'].str.split().str.len()\n", + "\n", + "print(f\"Text length statistics:\")\n", + "print(train_data['text_length'].describe())\n", + "print(f\"\\nTitle length statistics:\")\n", + "print(train_data['title_length'].describe())\n", + "print(f\"\\nWord count statistics:\")\n", + "print(train_data['word_count'].describe())\n", + "print(f\"\\nTitle word count statistics:\")\n", + "print(train_data['title_word_count'].describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "=== LENGTH DISTRIBUTIONS BY LABEL ===\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualize length distributions by label\n", + "print(\"=== LENGTH DISTRIBUTIONS BY LABEL ===\")\n", + "\n", + "# Separate fake and real news\n", + "fake_news = train_data[train_data['label'] == 0]\n", + "real_news = train_data[train_data['label'] == 1]\n", + "\n", + "# Create subplots\n", + "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n", + "\n", + "axes[0, 0].hist(fake_news['text_length'], bins=50, alpha=0.7, label='Fake', color='red')\n", + "axes[0, 0].hist(real_news['text_length'], bins=50, alpha=0.7, label='Real', color='green')\n", + "axes[0, 0].set_title('Text Length Distribution by Label')\n", + "axes[0, 0].set_xlabel('Text Length (characters)')\n", + "axes[0, 0].set_ylabel('Frequency')\n", + "axes[0, 0].legend()\n", + "\n", + "axes[0, 1].hist(fake_news['title_length'], bins=30, alpha=0.7, label='Fake', color='red')\n", + "axes[0, 1].hist(real_news['title_length'], bins=30, alpha=0.7, label='Real', color='green')\n", + "axes[0, 1].set_title('Title Length Distribution by Label')\n", + "axes[0, 1].set_xlabel('Title Length (characters)')\n", + "axes[0, 1].set_ylabel('Frequency')\n", + "axes[0, 1].legend()\n", + "\n", + "axes[1, 0].hist(fake_news['word_count'], bins=50, alpha=0.7, label='Fake', color='red')\n", + "axes[1, 0].hist(real_news['word_count'], bins=50, alpha=0.7, label='Real', color='green')\n", + "axes[1, 0].set_title('Word Count Distribution by Label')\n", + "axes[1, 0].set_xlabel('Word Count')\n", + "axes[1, 0].set_ylabel('Frequency')\n", + "axes[1, 0].legend()\n", + "\n", + "axes[1, 1].hist(fake_news['title_word_count'], bins=20, alpha=0.7, label='Fake', color='red')\n", + "axes[1, 1].hist(real_news['title_word_count'], bins=20, alpha=0.7, label='Real', color='green')\n", + "axes[1, 1].set_title('Title Word Count Distribution by Label')\n", + "axes[1, 1].set_xlabel('Title Word Count')\n", + "axes[1, 1].set_ylabel('Frequency')\n", + "axes[1, 
1].legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Text Preprocessing Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text preprocessing functions defined!\n" + ] + } + ], + "source": [ + "# Text preprocessing functions\n", + "def clean_text(text):\n", + " \"\"\"Clean and preprocess text data\"\"\"\n", + " if pd.isna(text):\n", + " return \"\"\n", + " \n", + " # Convert to lowercase\n", + " text = text.lower()\n", + " \n", + " # Remove special characters and digits\n", + " text = re.sub(r'[^a-zA-Z\\s]', '', text)\n", + " \n", + " # Remove extra whitespace\n", + " text = ' '.join(text.split())\n", + " \n", + " return text\n", + "\n", + "def remove_stopwords(text):\n", + " \"\"\"Remove stopwords from text\"\"\"\n", + " if pd.isna(text):\n", + " return \"\"\n", + " \n", + " try:\n", + " stop_words = set(stopwords.words('english'))\n", + " words = word_tokenize(text)\n", + " filtered_words = [word for word in words if word.lower() not in stop_words]\n", + " return ' '.join(filtered_words)\n", + " except:\n", + " return text\n", + "\n", + "def lemmatize_text(text):\n", + " \"\"\"Lemmatize text\"\"\"\n", + " if pd.isna(text):\n", + " return \"\"\n", + " \n", + " try:\n", + " lemmatizer = WordNetLemmatizer()\n", + " words = word_tokenize(text)\n", + " lemmatized_words = [lemmatizer.lemmatize(word) for word in words]\n", + " return ' '.join(lemmatized_words)\n", + " except:\n", + " return text\n", + "\n", + "def preprocess_text(text):\n", + " \"\"\"Complete text preprocessing pipeline\"\"\"\n", + " text = clean_text(text)\n", + " text = remove_stopwords(text)\n", + " text = lemmatize_text(text)\n", + " return text\n", + "\n", + "print(\"Text preprocessing functions defined!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Word Frequency Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== WORD FREQUENCY ANALYSIS ===\n", + "Most common words in all articles:\n", + "trump: 122117\n", + "said: 118254\n", + "u: 54189\n", + "state: 51728\n", + "would: 49438\n", + "president: 48701\n", + "republican: 37084\n", + "people: 36420\n", + "one: 31535\n", + "year: 30011\n", + "reuters: 26711\n", + "new: 26710\n", + "also: 26608\n", + "donald: 25991\n", + "house: 25842\n", + "clinton: 24311\n", + "government: 24207\n", + "time: 22899\n", + "obama: 21929\n", + "say: 21899\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analyze most common words\n", + "print(\"=== WORD FREQUENCY ANALYSIS ===\")\n", + "\n", + "# Get all text\n", + "all_text = ' '.join(train_data['text'].astype(str))\n", + "all_text_clean = preprocess_text(all_text)\n", + "\n", + "# Count word frequencies\n", + "words = all_text_clean.split()\n", + "word_freq = Counter(words)\n", + "most_common_words = word_freq.most_common(20)\n", + "\n", + "print(\"Most common words in all articles:\")\n", + "for word, count in most_common_words:\n", + " print(f\"{word}: {count}\")\n", + "\n", + "# Visualize most common words\n", + "words_list, counts_list = zip(*most_common_words)\n", + "\n", + "plt.figure(figsize=(12, 8))\n", + "plt.barh(range(len(words_list)), counts_list)\n", + "plt.yticks(range(len(words_list)), words_list)\n", + "plt.xlabel('Frequency')\n", + "plt.title('Most Common Words in All Articles')\n", + "plt.gca().invert_yaxis()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== WORD FREQUENCY COMPARISON ===\n", + "Most common words in FAKE news:\n", + "trump: 68316\n", + "said: 25868\n", + "president: 22858\n", + "people: 22297\n", + "would: 19413\n", + "one: 19366\n", + "state: 17413\n", + "donald: 15737\n", + "like: 15146\n", + "republican: 15012\n", + "\n", + "Most common words in REAL news:\n", + "said: 92386\n", + "trump: 53801\n", + "u: 39802\n", + "state: 34315\n", + "would: 30025\n", + "reuters: 26456\n", + "president: 25843\n", + "republican: 22072\n", + "year: 17926\n", + "government: 17909\n" + ] + } + ], + "source": [ + "# Compare word frequencies between fake and real news\n", + "print(\"=== WORD FREQUENCY COMPARISON ===\")\n", + "\n", + "# Get text for fake and real news separately\n", + "fake_text = ' '.join(fake_news['text'].astype(str))\n", + "real_text = 
' '.join(real_news['text'].astype(str))\n", + "\n", + "fake_text_clean = preprocess_text(fake_text)\n", + "real_text_clean = preprocess_text(real_text)\n", + "\n", + "# Count frequencies\n", + "fake_words = fake_text_clean.split()\n", + "real_words = real_text_clean.split()\n", + "\n", + "fake_freq = Counter(fake_words)\n", + "real_freq = Counter(real_words)\n", + "\n", + "print(\"Most common words in FAKE news:\")\n", + "for word, count in fake_freq.most_common(10):\n", + " print(f\"{word}: {count}\")\n", + "\n", + "print(\"\\nMost common words in REAL news:\")\n", + "for word, count in real_freq.most_common(10):\n", + " print(f\"{word}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Sample Articles Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== SAMPLE ARTICLES ===\n", + "\n", + "📰 SAMPLE FAKE NEWS ARTICLES:\n", + "\n", + "--- Article 19999 ---\n", + "Title: Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing\n", + "Subject: News\n", + "Date: December 31, 2017\n", + "Text (first 200 chars): Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former rea...\n", + "\n", + "--- Article 20000 ---\n", + "Title: Drunk Bragging Trump Staffer Started Russian Collusion Investigation\n", + "Subject: News\n", + "Date: December 31, 2017\n", + "Text (first 200 chars): House Intelligence Committee Chairman Devin Nunes is going to have a bad day. 
He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investig...\n", + "\n", + "--- Article 20001 ---\n", + "Title: Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People ‘In The Eye’\n", + "Subject: News\n", + "Date: December 30, 2017\n", + "Text (first 200 chars): On Friday, it was revealed that former Milwaukee Sheriff David Clarke, who was being considered for Homeland Security Secretary in Donald Trump s administration, has an email scandal of his own.In Jan...\n", + "\n", + "📰 SAMPLE REAL NEWS ARTICLES:\n", + "\n", + "--- Article 0 ---\n", + "Title: As U.S. budget fight looms, Republicans flip their fiscal script\n", + "Subject: politicsNews\n", + "Date: December 31, 2017 \n", + "Text (first 200 chars): WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal ...\n", + "\n", + "--- Article 1 ---\n", + "Title: U.S. military to accept transgender recruits on Monday: Pentagon\n", + "Subject: politicsNews\n", + "Date: December 29, 2017 \n", + "Text (first 200 chars): WASHINGTON (Reuters) - Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after Presiden...\n", + "\n", + "--- Article 2 ---\n", + "Title: Senior U.S. Republican senator: 'Let Mr. 
Mueller do his job'\n", + "Subject: politicsNews\n", + "Date: December 31, 2017 \n", + "Text (first 200 chars): WASHINGTON (Reuters) - The special counsel investigation of links between Russia and President Trump’s 2016 election campaign should continue without interference in 2018, despite calls from some Trum...\n" + ] + } + ], + "source": [ + "# Display sample articles\n", + "print(\"=== SAMPLE ARTICLES ===\")\n", + "\n", + "print(\"\\n📰 SAMPLE FAKE NEWS ARTICLES:\")\n", + "fake_samples = fake_news.head(3)\n", + "for idx, row in fake_samples.iterrows():\n", + " print(f\"\\n--- Article {idx} ---\")\n", + " print(f\"Title: {row['title']}\")\n", + " print(f\"Subject: {row['subject']}\")\n", + " print(f\"Date: {row['date']}\")\n", + " print(f\"Text (first 200 chars): {row['text'][:200]}...\")\n", + "\n", + "print(\"\\n📰 SAMPLE REAL NEWS ARTICLES:\")\n", + "real_samples = real_news.head(3)\n", + "for idx, row in real_samples.iterrows():\n", + " print(f\"\\n--- Article {idx} ---\")\n", + " print(f\"Title: {row['title']}\")\n", + " print(f\"Subject: {row['subject']}\")\n", + " print(f\"Date: {row['date']}\")\n", + " print(f\"Text (first 200 chars): {row['text'][:200]}...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Data Quality Assessment" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== DATA QUALITY ASSESSMENT ===\n", + "Duplicate rows - Training: 201, Validation: 8\n", + "Empty text - Training: 610, Validation: 21\n", + "Very short text (<50 chars) - Training: 795, Validation: 41\n", + "Very long text (>10000 chars) - Training: 137, Validation: 215\n" + ] + } + ], + "source": [ + "# Assess data quality\n", + "print(\"=== DATA QUALITY ASSESSMENT ===\")\n", + "\n", + "# Check for duplicates\n", + "train_duplicates = train_data.duplicated().sum()\n", + "val_duplicates = validation_data.duplicated().sum()\n", + "print(f\"Duplicate rows - Training: {train_duplicates}, Validation: {val_duplicates}\")\n", + "\n", + "# Check for empty text\n", + "train_empty_text = (train_data['text'].str.strip() == '').sum()\n", + "val_empty_text = (validation_data['text'].str.strip() == '').sum()\n", + "print(f\"Empty text - Training: {train_empty_text}, Validation: {val_empty_text}\")\n", + "\n", + "# Check for very short text\n", + "train_short_text = (train_data['text'].str.len() < 50).sum()\n", + "val_short_text = (validation_data['text'].str.len() < 50).sum()\n", + "print(f\"Very short text (<50 chars) - Training: {train_short_text}, Validation: {val_short_text}\")\n", + "\n", + "# Check for very long text\n", + "train_long_text = (train_data['text'].str.len() > 10000).sum()\n", + "val_long_text = (validation_data['text'].str.len() > 10000).sum()\n", + "print(f\"Very long text (>10000 chars) - Training: {train_long_text}, Validation: {val_long_text}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. 
Summary Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== SUMMARY STATISTICS ===\n", + "\n", + "📊 DATASET OVERVIEW:\n", + "Training articles: 39,942\n", + "Validation articles: 4,956\n", + "Total articles: 44,898\n", + "\n", + "📈 LABEL DISTRIBUTION:\n", + "Fake news: 19,943 (49.9%)\n", + "Real news: 19,999 (50.1%)\n", + "\n", + "📝 TEXT STATISTICS:\n", + "Average text length: 2385 characters\n", + "Average word count: 392 words\n", + "Average title length: 80 characters\n", + "Average title word count: 12 words\n", + "\n", + "📅 DATE RANGE:\n", + "From: 2015-03-31 00:00:00\n", + "To: 2018-02-19 00:00:00\n", + "Span: 1056 days\n", + "Successfully parsed: 39,934 (100.0%)\n", + "\n", + "🏷️ SUBJECTS:\n", + "politicsNews: 11,272 articles\n", + "News: 9,050 articles\n", + "worldnews: 8,727 articles\n", + "politics: 6,841 articles\n", + "left-news: 2,482 articles\n", + "Government News: 1,570 articles\n" + ] + } + ], + "source": [ + "# Summary statistics\n", + "print(\"=== SUMMARY STATISTICS ===\")\n", + "\n", + "print(f\"\\n📊 DATASET OVERVIEW:\")\n", + "print(f\"Training articles: {len(train_data):,}\")\n", + "print(f\"Validation articles: {len(validation_data):,}\")\n", + "print(f\"Total articles: {len(train_data) + len(validation_data):,}\")\n", + "\n", + "print(f\"\\n📈 LABEL DISTRIBUTION:\")\n", + "print(f\"Fake news: {(train_data['label'] == 0).sum():,} ({(train_data['label'] == 0).mean() * 100:.1f}%)\")\n", + "print(f\"Real news: {(train_data['label'] == 1).sum():,} ({(train_data['label'] == 1).mean() * 100:.1f}%)\")\n", + "\n", + "print(f\"\\n📝 TEXT STATISTICS:\")\n", + "print(f\"Average text length: {train_data['text_length'].mean():.0f} characters\")\n", + "print(f\"Average word count: {train_data['word_count'].mean():.0f} words\")\n", + "print(f\"Average title length: {train_data['title_length'].mean():.0f} characters\")\n", + 
"print(f\"Average title word count: {train_data['title_word_count'].mean():.0f} words\")\n", + "\n", + "print(f\"\\n📅 DATE RANGE:\")\n", + "if 'date_parsed' in train_data.columns and train_data['date_parsed'].notna().any():\n", + " valid_dates = train_data['date_parsed'].dropna()\n", + " print(f\"From: {valid_dates.min()}\")\n", + " print(f\"To: {valid_dates.max()}\")\n", + " print(f\"Span: {(valid_dates.max() - valid_dates.min()).days} days\")\n", + " print(f\"Successfully parsed: {len(valid_dates):,} ({len(valid_dates)/len(train_data)*100:.1f}%)\")\n", + "else:\n", + " print(\"Date parsing not available\")\n", + "\n", + "print(f\"\\n🏷️ SUBJECTS:\")\n", + "for subject, count in train_data['subject'].value_counts().items():\n", + " print(f\"{subject}: {count:,} articles\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['label', 'title', 'text', 'subject', 'date', 'date_clean',\n", + " 'date_parsed', 'text_length', 'title_length', 'word_count',\n", + " 'title_word_count'],\n", + " dtype='object')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA04AAAJFCAYAAADqEKf2AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAP3hJREFUeJzt3QmYFNW9N/4f+yICbqwScYuoKCK8enFDDYKiGBJzJcQEJIgr9yrEDZOAxihxQ4yiqBGX96rgFhPFSCJxuSpxAfVGo7gggagsagRlX+b/nHr/M5eBgUIyM83MfD7PU9BdXdV9uqdrur9zzvlVraKioqIAAABgo2pv/CYAAAASwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCWAr1b59+zj11FML3Ywa75lnnolatWpl/xfiPXDCCSdU+uMCsCHBCaCSffDBB3HGGWfEbrvtFg0bNoymTZvGoYceGjfccEMsW7as0M2rsW6++ea46667Ct0MALZSdQvdAICaZPLkyfHv//7v0aBBgxgwYEB07NgxVq5cGc8//3xccMEF8dZbb8Vtt91W6GbW2OC04447btDLd8QRR2SBtn79+gVrGwCFJzgBVJIPP/wwvv/978cuu+wSf/7zn6N169Ylt51zzjnx/vvvZ8Gqulq7dm0WElMv29Zk6dKl0bhx443eXrt27a2uzdXV1voeAUgM1QOoJFdffXV89dVXcccdd5QKTcX22GOPOPfccze6/+effx7nn39+7LffftGkSZNsiN9xxx0Xb7zxxgbb3njjjbHvvvtmgWC77baLrl27xn333Vdy+5dffhnnnXdeNocm9X61aNEijjnmmJgxY8Ymn8Oll16azfd555134uSTT87asMMOO2TtXr58ealt03ZDhw6Ne++9N2tLepwnn3wyu+21117L2p72T8/lW9/6VvzlL38ptX8aNpfu47nnnsuGNqbHSdunnrp//vOfZfYYFT9OmzZtsjD6xRdflNrmyCOPzHr5pk+fnvUkpdfnkksuyV6H1Nv37LPPZo+ZlrTtpuY4Pfjgg9GlS5do1KhR1lP1wx/+MD766KNS26Teq/T80vq+fftml3faaafs57hmzZrYXH/84x/jgAMOyALFPvvsE4888kjJbbNmzcrad/3112+w34svvpjddv/995d5v+n9uM0225T5vvvHP/4RderUidGjR5esS69net+0a9cue53Te/aqq67KAs+6rr322jjkkEOyn1l6fdLr9NBDD23wGJt6jwBsbfQ4AVSSxx57LJvXlL5Qbon0BfnRRx/NhvrtuuuuMX/+/Lj11luje/fu8be//S0LC8ntt98e//mf/xnf+973SgLN//zP/8RLL70UP/jBD7JtzjzzzOyLbPrSmr6If/bZZ9lwwbfffjsOPPDA3Lak0JTCRvpSnQLPr3/96yzM3HPPPaW2Sz1rDzzwQPY4KVwUB5TDDz88C0EXXnhh1KtXL3seKaik4HLwwQeXuo+0b/PmzbPQNnPmzLjlllvi73//e0mgSdJtl112WfTo0SPOOuusku1eeeWVeOGFF7LHKJaeawptqfcvhZ2WLVtmj/0f//EfWbD56U9/mm2X1m9MCnWDBg2K//N//k/2GqSfRZqjlh4rhcLU3mIpIPXq1St7XilQPPXUU3HdddfF7rvvnrU1z3vvvRf9+vXLfmYDBw6MO++8M3sPpICRwm56T6U5cil8DBs2rNS+ad22224b3/72t8u87/R8v/Od78SkSZNizJgxWVAqlsJWUVFRnHLKKSU9c+m9lkJgCrLf+MY3smA2YsSI+OSTT2Ls2LEl+6bX4sQTT8z2TT1IEydOzNr8+OOPx/HHH5/7HgHYKhUBUOEWLVpUlH7lfvvb397sfXbZZZeigQMHllxfvnx50Zo1a0pt8+GHHxY1aNCg6Be/+EXJuvQY++677ybvu1mzZkXnnHNO0dc1atSo7Hm
ceOKJpdafffbZ2fo33nijZF26Xrt27aK33nqr1LZ9+/Ytql+/ftEHH3xQsu7jjz8u2nbbbYuOOOKIknV33nlndh9dunQpWrlyZcn6q6++Olv/u9/9Lru+YMGC7P569uxZ6vW56aabsu0mTJhQsq579+7ZuvHjx2/w3NJrlm5f39NPP53tk/5PUltatGhR1LFjx6Jly5aVbPf4449n240cObJkXfr5pXXr/nySzp07Z89rc94Daf+HH3641HupdevW2X0Uu/XWW7Pt3n777ZJ1qZ077rhjqfdQWaZMmZLt+4c//KHU+v3337/U63H55ZcXbbPNNkXvvvtuqe0uvvjiojp16hTNmTOnZN3SpUtLbZPakl6vo48+utT6jb1HALZGhuoBVILFixdn/6e//m+pNIwpzbcp7sVIPSepx2CvvfYqNcQu9XakYVapt2Vj0japB+rjjz/eorakYXDrSr01yRNPPFFqfeqhSD1axVK707CzNGwt9ZQUS0MXU29Y6vUqfq2KnX766aV6jFIvTd26dUseK/XgpF6NNISs+PVJhgwZkvVqrT9vLL2OqbdoS7366quxYMGCOPvss0vNxUk9KR06dChznlrqLVpX6nFLPYibI/Ukpl6hYsXDFVPP1rx580p6AFNbUg9TsSlTpsSnn36a9aptSuqlS4+x7r5vvvlm1ku57r5paGJqdxr6me63eEn7p59rGlJZLA3PK5Z6IhctWpTtW9ZQ0PXfIwBbK8EJoBKkL7vFc4u2VJpHkuax7LnnntmX/zSsKc2XSV9w0xfTYhdddFEWqA466KBs2xRy0hCy9edbpS/Haa5K2i4NddvcL/JJut91pWFnKbTMnj271Po0pHBdCxcuzIZ8pbC3vr333jt7jnPnzt3kY6XnloJW8WOlYXvJ+veZquClcFZ8e7G2bdv+SxXyNvZ4SQpO6z9eCjTp57SuFD7KmqdVljSPqHhIYrFvfvOb2f/Fr0EKwn369Ck1jy0FofRcjz766E3ef/q5pSF1aRho+tkU75vanYbXrTtkMA0PTM9l3SUFpySFyWJpSN6//du/Zfex/fbbZ9uloZPrvk839h4B2FoJTgCVFJzSX/VTWNlSV155ZQwfPjwravBf//VfWY/Cn/70p2xS/bqT81MASXN80rySww47LB5++OHs/1GjRpVsk3ooUlBKRSRSu6655prsfv7whz9sUdvW/2JfVs/D1qKy27TuvKGKlHqh0s80zTtKAf33v/999O/fv1Qv3Kb2TYUiUnhKI+hSAEsn3m3WrFnJNuk9luZUpfdcWctJJ52Ubfff//3f2fymFJpSwY7UM5huTz2K/2903tb/HgEoi+IQAJUkfRFN52iaNm1adOvW7Wvvn4o5HHXUUVlVvnWlSmep92ldqVJaKiiQljSM7bvf/W5cccUV2UT+4uFlqdcmDTdLS+otSEUh0japcEKe1Puwbk9BKqWevljnTexPPQ+pkl0KdutLlfrSl/zUC7b+Y6XnXSx9wU/FCHr37p1dT+Xdk3Sf6w7/S887lYAv7hHZ0vC3vnUfb/3enLSu+Pbykl7bFDjWbd+7776b/b/u633sscdmr2/qLUqFKFLv0Y9+9KPNeoxUabBz587ZvjvvvHPMmTMnC9Xr9yqm1z7v9UxBPb3HUrBPPaPFUlELgKpMjxNAJUkV5FKgOe2007IqbOv74IMPsmpkm+q5WP8v9mneyfolsNPcp3WlYWlpDknad9WqVdl8lPWHTKVy5KnnacWKFZv1XMaNG1fqevGX7LzQlZ5Dz54943e/+12pYX3p9Ui9HKlnrHhYY7EUNlO7i6UhX6tXry55rPRFPj3HVNlv3dcnBcz0PNev4rYx6WezfvnysqTS7un1Gj9+fKnXK/XWpaqEm/t4myvNQ/vtb39bcj3NAUvVC1N58latWpWsT/O+Ug9TqlCXqv6lsvX777//Zj9OCllp/lmqjpfKiK//s0y9lCn0p0C0vvS6pZ9J8c84hbx1y62
nn3XqzQKoyvQ4AVSS9Bf7FA5SL1AaTpeGR6W/9KeekTS8KoWgdN6fTfVY/eIXv8gKG6SS5n/961+zHoJ1e1mSFEzSF+pUojqV1E5f5m+66absC30qTpG+5KZehVSuvFOnTtmcoVRgIRWTSGWyN0fqyUnDsVIvR/oynYYOpqFY6f7y/PKXv8yGbqWQlHq70hf+VI48hZA092p96fVJ53lKX9xTj04a/pX2TY+fpF6W1JOWypGn9qT1xdulcuF5xRGKpXMNpVCW2pfmFaVwVNb8oFSoIp27KP0cUmGDFFaKy5GnHqD1S4L/q9J8psGDB2c/n/TznDBhQvZ4ZfXgpPdUCpBPP/101savI/38UrhPIS0V4Fi3IEdywQUXZMP/0vswvU/T67VkyZLsfZh6Q1M4Sj2f6X2WSpunn0W6z9SbmYJ2ek3TfDyAKqvQZf0AappUznnIkCFF7du3z8popzLchx56aNGNN96YlRzfVDnyn/zkJ1kp6kaNGmX7TJs2LSsZvW7Z6FSaOpX13mGHHbJS5bvvvnvRBRdckJWxTlasWJFd79SpU/bYqcR0unzzzTdvdjnyv/3tb0Xf+973sv232267oqFDh5YqzZ2k7TZW8nzGjBlFvXr1KmrSpElR48aNi4466qiiF198sdQ2xeXIn3322aLTTz89e5y0/SmnnFL02WefbXCfqfx4hw4diurVq1fUsmXLorPOOqvon//8Z6lt0uu0sVLt8+bNKzr++OOz55Qet/g1Xb8cebFJkyZlJcHTa7z99ttn7frHP/5Rapv080uv78ZexzzpPZDalEqGp/Lg6bHSc3zwwQc3uk96fqnE9/pt2Ry9e/fO2rX+z6LYl19+WTRixIiiPfbYI3vvpnLnhxxySNG1115bqmT8HXfcUbTnnnuWtDf9LMt6zpt6jwBsbWqlfwod3gCoGopPNJuq460/r6q8FZ9kNvW0pOFxbJ40VylVsps6derX3jeVPU89SGleFQClmeMEANVEOsfU66+/ng3Z+7pSwY10DqrNLSgBUNOY4wQAVVwqcz99+vRsjlqqlpjm0W2uNF8tnefrN7/5TTav6YwzzqjQtgJUVXqcAKCKS8UZ0rDGVH3w/vvvLyk5vzmeffbZrJcpBai77767VKU+AP6XOU4AAAA59DgBAADkqHFznNKZ7dPJBNO5TDb3LPEAAED1kwbfffnll9lJ4GvX3nSfUo0LTik0tWvXrtDNAAAAthJz587NTg6/KTUuOKWepuIXp2nTpoVuDgAAUCCLFy/OOlWKM8Km1LjgVDw8L4UmwQkAAKi1GVN4FIcAAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAsDUHp+eeey769OkTbdq0iVq1asWjjz6au88zzzwTBx54YDRo0CD22GOPuOuuuyqlrQAAQM1V0OC0ZMmS6NSpU4wbN26ztv/www/j+OOPj6OOOipef/31OO+88+K0006LKVOmVHhbAQCAmqtuIR/8uOOOy5bNNX78+Nh1113juuuuy67vvffe8fzzz8f1118fvXr1KnOfFStWZEuxxYsXl0PLAQCAmqSgwenrmjZtWvTo0aPUuhSYUs/TxowePTouu+yyqG7aXzy50E2o8Wb/6vhCN6Fmu7RZoVvApYsK3QIcB4XnOCg434kKb3YN+U5UpYpDzJs3L1q2bFlqXbqeepGWLVtW5j4jRoyIRYsWlSxz586tpNYCAADVRZXqcdoSqYhEWgAAAGpEj1OrVq1i/vz5pdal602bNo1GjRoVrF0AAED1VqWCU7du3WLq1Kml1v3pT3/K1gMAAFTL4PTVV19lZcXTUlxuPF2eM2dOyfykAQMGlGx/5plnxqxZs+LCCy+Md955J26++eZ44IEHYtiwYQV7DgAAQPVX0OD06quvRufOnbMlGT5
8eHZ55MiR2fVPPvmkJEQlqRT55MmTs16mdP6nVJb8N7/5zUZLkQMAAFT54hBHHnlkFBUVbfT2u+66q8x9XnvttQpuGQAAQBWd4wQAAFAIghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAANjag9O4ceOiffv20bBhwzj44IPj5Zdf3uT2Y8eOjb322isaNWoU7dq1i2HDhsXy5csrrb0AAEDNU9DgNGnSpBg+fHiMGjUqZsyYEZ06dYpevXrFggULytz+vvvui4svvjjb/u2334477rgju49LLrmk0tsOAADUHAUNTmPGjIkhQ4bEoEGDYp999onx48dH48aNY8KECWVu/+KLL8ahhx4aP/jBD7Jeqp49e0b//v032Uu1YsWKWLx4cakFAACgSgSnlStXxvTp06NHjx7/25jatbPr06ZNK3OfQw45JNunOCjNmjUrnnjiiejdu/dGH2f06NHRrFmzkiUN7wMAAPg66kaBfPrpp7FmzZpo2bJlqfXp+jvvvFPmPqmnKe132GGHRVFRUaxevTrOPPPMTQ7VGzFiRDYcsFjqcRKeAACAKlUc4ut45pln4sorr4ybb745mxP1yCOPxOTJk+Pyyy/f6D4NGjSIpk2blloAAACqRI/TjjvuGHXq1In58+eXWp+ut2rVqsx9fv7zn8ePfvSjOO2007Lr++23XyxZsiROP/30+OlPf5oN9QMAAChvBUsa9evXjy5dusTUqVNL1q1duza73q1btzL3Wbp06QbhKIWvJA3dAwAAqFY9TkmaezRw4MDo2rVrHHTQQdk5mlIPUqqylwwYMCDatm2bFXhI+vTpk1Xi69y5c3bOp/fffz/rhUrriwMUAABAtQpO/fr1i4ULF8bIkSNj3rx5ccABB8STTz5ZUjBizpw5pXqYfvazn0WtWrWy/z/66KPYaaedstB0xRVXFPBZAAAA1V1Bg1MydOjQbNlYMYh11a1bNzv5bVoAAAAqi2oKAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAActTN2wBga9R++X2FbkKNN7vQDQCASqTHCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAAC
AHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAAbO3Bady4cdG+ffto2LBhHHzwwfHyyy9vcvsvvvgizjnnnGjdunU0aNAgvvnNb8YTTzxRae0FAABqnrqFfPBJkybF8OHDY/z48VloGjt2bPTq1StmzpwZLVq02GD7lStXxjHHHJPd9tBDD0Xbtm3j73//ezRv3rwg7QcAAGqGgganMWPGxJAhQ2LQoEHZ9RSgJk+eHBMmTIiLL754g+3T+s8//zxefPHFqFevXrYu9VZtyooVK7Kl2OLFi8v9eQAAANVbwYbqpd6j6dOnR48ePf63MbVrZ9enTZtW5j6///3vo1u3btlQvZYtW0bHjh3jyiuvjDVr1mz0cUaPHh3NmjUrWdq1a1chzwcAAKi+ChacPv300yzwpAC0rnR93rx5Ze4za9asbIhe2i/Na/r5z38e1113Xfzyl7/c6OOMGDEiFi1aVLLMnTu33J8LAABQvW3RUL0UXO66666YOnVqLFiwINauXVvq9j//+c9REdLjpPlNt912W9SpUye6dOkSH330UVxzzTUxatSoMvdJBSTSAgAAUKnB6dxzz82C0/HHH58Nl6tVq9bXvo8dd9wxCz/z588vtT5db9WqVZn7pEp6aW5T2q/Y3nvvnfVQpaF/9evX34JnAwAAUAHBaeLEifHAAw9E7969Y0ulkJN6jFKvVd++fUt6lNL1oUOHlrnPoYceGvfdd1+2XZoPlbz77rtZoBKaAACArWqOUwope+yxx7/84KkU+e233x533313vP3223HWWWfFkiVLSqrsDRgwIJujVCzdnqrqpR6vFJhSBb5UHCIViwAAANiqepx+8pOfxA033BA33XTTFg3TK9avX79YuHBhjBw5Mhtud8ABB8STTz5ZUjBizpw5JT1LSaqIN2XKlBg2bFjsv//+2XmcUoi66KKLtrgNAAAAFRKcnn/++Xj66afjD3/4Q+y7774l51Qq9sgjj2z2faVheRsbmvfMM89ssC6VI//LX/6yBa0GAACoxODUvHnz+M53vrOFDwkAAFADgtOdd95Z/i0BAACoTsGpWJqfNHPmzOzyXnvtFTvttFN5tQsAAKBqV9VLle9+/OMfZ2XAjzjiiGxp06ZNDB48OJYuXVr+rQQAAKhqwSmVEX/22Wfjscceiy+++CJbfve732XrUsU9AACAqOlD9R5++OF46KGH4sgjjyxZl06G26hRozj55JPjlltuKc82AgAAVL0epzQcr/hcS+tq0aKFoXoAAEC1s0XBKZ1LadSoUbF8+fKSdcuWLYvLLrssuw0AACBq+lC9G264IXr16hU777xzdOrUKVv3xhtvRMOGDWPKlCnl3UYAAICqF5w6duwY7733Xtx7773xzjvvZOv69+8fp5xySjbPCQAAoDrZ4vM4NW7cOIYMGVK+rQEAAKjKwen3v/99HHfccVGvXr3s8qaceOKJ5dE2AACAqhWc+vbtG/Pmzcsq56XLG1OrVq1Ys2ZNebUPAACg6gSntWvXlnkZAACgutuicuT33HNPrFixYoP1K1euzG4DAACImh6cBg0aFIsWLdpg/ZdffpndBgAAEDU9OBUVFWVzmdb3j3/8I5o1a1Ye7QIAAKia5cg7d+6cBaa0fOtb34q6df9391QQ4sMPP4xjjz22ItoJAABQNYJTcTW9119/PXr16hVNmjQpua1+/frRvn37OOmkk8q/lQAAAFUlOI0aNSrrWUoBqWfPntG6deuKaxkAAEBVneNUp06dOOOMM2L58uUV0yIAAIDqUByiY8eOMWvWrPJvDQAAQHUJTr/85S/j/PPPj8cffzw++eSTWLx4cakFAACgxs5xKta7d+/s/xNPPLFUWfLiMuV
pHhQAAECNDk5PP/10+bcEAACgOgWn7t27l39LAAAAqlNwSr744ou444474u23386u77vvvvHjH/84mjVrVp7tAwAAqJrFIV599dXYfffd4/rrr4/PP/88W8aMGZOtmzFjRvm3EgAAoKr1OA0bNiwrDHH77bdH3br/7y5Wr14dp512Wpx33nnx3HPPlXc7AQAAqlZwSj1O64am7I7q1o0LL7wwunbtWp7tAwAAqJpD9Zo2bRpz5szZYP3cuXNj2223LY92AQAAVO3g1K9fvxg8eHBMmjQpC0tpmThxYjZUr3///uXfSgAAgKo2VO/aa6/NTnQ7YMCAbG5TUq9evTjrrLPiV7/6VXm3EQAAoOoFp/r168cNN9wQo0ePjg8++CBblyrqNW7cuLzbBwAAUHBbfB6nJAWl5s2bl1wGAACojrZojlManvfzn/88O9lt+/btsyVd/tnPfharVq0q/1YCAABUtR6n//iP/4hHHnkkrr766ujWrVu2btq0aXHppZfGZ599Frfcckt5txMAAKBqBaf77rsvq6J33HHHlazbf//9o127dllVPcEJAACImj5Ur0GDBtnwvPXtuuuuWeEIAACAqOnBaejQoXH55ZfHihUrStaly1dccUV2GwAAQNT0oXqvvfZaTJ06NXbeeefo1KlTtu6NN96IlStXxre+9a347ne/W7JtmgsFAABQ44JTKkF+0kknlVqX5jcBAABUR1sUnO68887ybwkAAEB1PAHuwoULY+bMmdnlvfbaK3baaafyahcAAEDVLg6xZMmS+PGPfxytW7eOI444IlvatGkTgwcPjqVLl5Z/KwEAAKpacBo+fHg8++yz8dhjj8UXX3yRLb/73e+ydT/5yU/Kv5UAAABVbajeww8/HA899FAceeSRJet69+4djRo1ipNPPtkJcAEAgGpli3qc0nC8li1bbrC+RYsWhuoBAADVzhYFp27dusWoUaNi+fLlJeuWLVsWl112WXYbAABA1PShemPHjo1jjz12gxPgNmzYMKZMmVLebQQAAKh6wWm//faL9957L+6999545513snX9+/ePU045JZvnBAAAUKOD06pVq6JDhw7x+OOPx5AhQyqmVQAAAFV5jlO9evVKzW0CAACo7raoOMQ555wTV111Vaxevbr8WwQAAFAd5ji98sorMXXq1PjjH/+YzXfaZpttSt3+yCOPlFf7AAAAqmZwat68eZx00knl3xoAAICqHpzWrl0b11xzTbz77ruxcuXKOProo+PSSy9VSQ8AAKjWvtYcpyuuuCIuueSSaNKkSbRt2zZ+/etfZ/OdAAAAqrOvFZzuueeeuPnmm7OT3D766KPx2GOPZedySj1RAAAA1dXXCk5z5syJ3r17l1zv0aNH1KpVKz7++OOKaBsAAEDVC06p/HjDhg03OK9TOikuAABAdfW1ikMUFRXFqaeeGg0aNChZl06Ge+aZZ5YqSa4cOQAAUGOD08CBAzdY98Mf/rA82wMAAFC1g9Odd95ZcS0BAACoDnOcAAAAaiLBCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAABUheA0bty4aN++fTRs2DAOPvjgePnllzdrv4kTJ0atWrWib9++Fd5GAACg5ip4cJo0aVIMHz48Ro0aFTNmzIhOnTpFr169YsGCBZvcb/bs2XH++efH4YcfXmltBQAAaqaCB6cxY8bEkCFDYtCgQbHPPvvE+PHjo3HjxjFhwoSN7rNmzZo45ZRT4rLLLovddtttk/e/YsWKWLx4cakFAACgygSnlStXxvTp06NHjx7/26DatbPr06ZN2+h+v/jFL6JFixYxePDg3McYPXp0NGvWrGRp165dubUfAACoGQoanD799NOs96hly5al1qfr8+bNK3Of559/Pu644464/fbbN+sxRowYEYsWLSpZ5s6dWy5tBwAAao66UYV8+eW
X8aMf/SgLTTvuuONm7dOgQYNsAQAAqJLBKYWfOnXqxPz580utT9dbtWq1wfYffPBBVhSiT58+JevWrl2b/V+3bt2YOXNm7L777pXQcgAAoCYp6FC9+vXrR5cuXWLq1KmlglC63q1btw2279ChQ/z1r3+N119/vWQ58cQT46ijjsoum78EAABUy6F6qRT5wIEDo2vXrnHQQQfF2LFjY8mSJVmVvWTAgAHRtm3brMhDOs9Tx44dS+3fvHnz7P/11wMAAFSb4NSvX79YuHBhjBw5MisIccABB8STTz5ZUjBizpw5WaU9AACAGhuckqFDh2ZLWZ555plN7nvXXXdVUKsAAAD+H105AAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACBH3bwNAICtU/vl9xW6CTXe7EI3AKg0epwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAAVIXgNG7cuGjfvn00bNgwDj744Hj55Zc3uu3tt98ehx9+eGy33XbZ0qNHj01uDwAAUOWD06RJk2L48OExatSomDFjRnTq1Cl69eoVCxYsKHP7Z555Jvr37x9PP/10TJs2Ldq1axc9e/aMjz76qNLbDgAA1AwFD05jxoyJIUOGxKBBg2KfffaJ8ePHR+PGjWPChAllbn/vvffG2WefHQcccEB06NAhfvOb38TatWtj6tSpZW6/YsWKWLx4cakFAACgygSnlStXxvTp07PhdiUNql07u556kzbH0qVLY9WqVbH99tuXefvo0aOjWbNmJUvqoQIAAKgywenTTz+NNWvWRMuWLUutT9fnzZu3Wfdx0UUXRZs2bUqFr3WNGDEiFi1aVLLMnTu3XNoOAADUHHWjCvvVr34VEydOzOY9pcISZWnQoEG2AAAAVMngtOOOO0adOnVi/vz5pdan661atdrkvtdee20WnJ566qnYf//9K7ilAABATVbQoXr169ePLl26lCrsUFzooVu3bhvd7+qrr47LL788nnzyyejatWsltRYAAKipCj5UL5UiHzhwYBaADjrooBg7dmwsWbIkq7KXDBgwINq2bZsVeUiuuuqqGDlyZNx3333ZuZ+K50I1adIkWwAAAKpdcOrXr18sXLgwC0MpBKUy46knqbhgxJw5c7JKe8VuueWWrBrf9773vVL3k84Ddemll1Z6+wEAgOqv4MEpGTp0aLaUJRV+WNfs2bMrqVUAAABbyQlwAQAAtnaCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQI66eRsAwL9izZo1sWrVqkI3o8qpV69e1KlTp9DNAOD/JzgBUCGKiopi3rx58cUXXxS6KVV
W8+bNo1WrVlGrVq1CNwWgxhOcAKgQxaGpRYsW0bhxY1/+v2boXLp0aSxYsCC73rp160I3CaDGE5wAqJDhecWhaYcddih0c6qkRo0aZf+n8JReR8P2AApLcQgAyl3xnKbU08SWK379zBEDKDzBCYAKY3jev8brB7D1EJwAAAByCE4AAAA5FIcAoFK1v3hypT3W7F8dX2mPddddd8V5552n/DpANaXHCQDWceqpp2Zzi9Zf3n///UI3DYAC0uMEAOs59thj48477yy1bqeddipYewAoPD1OALCeBg0aRKtWrUotN9xwQ+y3336xzTbbRLt27eLss8+Or776aqP3sXDhwujatWt85zvfiRUrVsTatWtj9OjRseuuu2bnaOrUqVM89NBDlfq8ANhyghMAbIbatWvHr3/963jrrbfi7rvvjj//+c9x4YUXlrnt3Llz4/DDD4+OHTtm4SgFsRSa7rnnnhg/fnx2H8OGDYsf/vCH8eyzz1b6cwHg6zNUDwDW8/jjj0eTJk1Krh933HHx4IMPllxv3759/PKXv4wzzzwzbr755lL7zpw5M4455pisp2ns2LHZ/KjU43TllVfGU089Fd26dcu222233eL555+PW2+9Nbp3716Jzw6ALSE4AcB6jjrqqLjllltKrqfheSn0pF6jd955JxYvXhyrV6+O5cuXx9KlS6Nx48bZdsuWLct6mn7wgx9koalYKiyRtkuBal0rV66Mzp07V+IzA2BLCU4AsJ4UlPbYY4+S67Nnz44TTjghzjrrrLjiiiti++23z3qLBg8enIWf4uCUhuT16NEj67G64IILom3bttn64rlQkydPLllXLO0DwNZPcAKAHNOnT8+KO1x33XXZXKfkgQce2GC7dNv//b//N+txSr1WzzzzTLRp0yb22WefLCDNmTPHsDyAKkpwAoAcqfdp1apVceONN0afPn3ihRdeyIo8lKVOnTpx7733Rv/+/ePoo4/OwlOqynf++ednBSFSADvssMNi0aJF2f00bdo0Bg4cWOnPCYCvR3ACoFLN/tXxUdWk0uFjxoyJq666KkaMGBFHHHFENt9pwIABZW5ft27duP/++6Nfv34l4enyyy/PzgWV9ps1a1Y0b948DjzwwLjkkksq/fkA8PXVKioqKooaJE3obdasWfaXvvRXvqqq/cWTC92EGq8qfvmrThwDW/cxkIomfPjhh9k5ixo2bFip7apO8l5Hx0Hh+SwoPMdB4c2uwsfB18kGzuMEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJCjbt4GAFCuLm1WiY+1KLZGp556anzxxRfx6KOPFropAGwmPU4AsF6oqVWrVrbUq1cvdt1117jwwgtj+fLlhW4aAAWkxwkA1nPsscfGnXfeGatWrYrp06fHwIEDsyB11VVXFbppABSIHicAWE+DBg2iVatW0a5du+jbt2/06NEj/vSnP2W3rV27NkaPHp31RDVq1Cg6deoUDz30UMm+a9asicGDB5fcvtdee8UNN9xQwGcDQHnQ4wQAm/Dmm2/Giy++GLvsskt2PYWm//qv/4rx48fHnnvuGc8991z88Ic/jJ122im6d++eBaudd945Hnzwwdhhhx2yfU8//fRo3bp1nHzyyYV+OgBsIcEJANbz+OOPR5MmTWL16tWxYsWKqF27dtx0003Z5SuvvDKeeuqp6NatW7btbrvtFs8//3zceuutWXBK86Iuu+yykvtKPU/Tpk2LBx54QHACqMIEJwBYz1FHHRW33HJLLFmyJK6//vqoW7dunHTSSfHWW2/F0qVL45hjjim1/cqVK6Nz584l18eNGxcTJkyIOXPmxLJly7LbDzjggAI8EwDKi+AEAOvZZpttYo899sgupwCU5jHdcccd0bFjx2zd5MmTo23bthvMi0omTpwY559/flx33XVZr9S2224b11xzTbz00ksFeCYAlBfBCQA2IQ3Tu+S
SS2L48OHx7rvvZgEp9SSlYXlleeGFF+KQQw6Js88+u2TdBx98UIktBqAiqKoHADn+/d//PerUqZPNY0q9ScOGDYu77747C0QzZsyIG2+8MbuepIIRr776akyZMiULWj//+c/jlVdeKfRTAOBfpMcJgMp16aKoatIcp6FDh8bVV18dH374YVZBL1XXmzVrVjRv3jwOPPDArFcqOeOMM+K1116Lfv36Zed+6t+/f9b79Ic//KHQTwOAf0GtoqKioqhBFi9eHM2aNYtFixZF06ZNo6pqf/HkQjehxpv9q+ML3YQazTGwdR8Dy5cvzwJGqijXsGHDSm1XdZL3OjoOCs9nQeE5DgpvdhU+Dr5ONjBUDwAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAKgwa9euLXQTqjSvH8DWQzlyAMpd/fr1sxPHfvzxx1np7nQ9leZm86SCtytXroyFCxdmr2N6/QAoLMEJgHKXvuynEtqffPJJFp7YMo0bN45vfOMb2esJQGEJTgBUiNRLkr70r169OtasWVPo5lQ5derUyU68q6cOYOsgOAFQYdKX/nr16mULAFRl+v4BAACqQnAaN25ctG/fPho2bBgHH3xwvPzyy5vc/sEHH4wOHTpk2++3337xxBNPVFpbAQCAmqfgwWnSpEkxfPjwGDVqVMyYMSM6deoUvXr1igULFpS5/Ysvvhj9+/ePwYMHx2uvvRZ9+/bNljfffLPS2w4AANQMBZ/jNGbMmBgyZEgMGjQouz5+/PiYPHlyTJgwIS6++OINtr/hhhvi2GOPjQsuuCC7fvnll8ef/vSnuOmmm7J917dixYpsKbZo0aLs/8WLF0dVtnbF0kI3ocar6u+hqs4xUHiOgcJzHBSe46DwHAeFt7gKHwfFbU+ngchVVEArVqwoqlOnTtFvf/vbUusHDBhQdOKJJ5a5T7t27Yquv/76UutGjhxZtP/++5e5/ahRo9KrYLFYLBaLxWKxWCxFZS1z587NzS4F7XH69NNPsxK1LVu2LLU+XX/nnXfK3GfevHllbp/Wl2XEiBHZUMB1z8L++eefxw477KDEawGTfbt27WLu3LnRtGnTQjcHCsJxAI4DcAwUXupp+vLLL6NNmzZb/1C9itagQYNsWVfz5s0L1h7+V/oF4ZcENZ3jABwH4BgorGbNmm39xSF23HHH7AR/8+fPL7U+XW/VqlWZ+6T1X2d7AACAf1XtQp9VvkuXLjF16tRSQ+nS9W7dupW5T1q/7vZJKg6xse0BAAD+VQUfqpfmHw0cODC6du0aBx10UIwdOzaWLFlSUmVvwIAB0bZt2xg9enR2/dxzz43u3bvHddddF8cff3xMnDgxXn311bjtttsK/EzYXGnoZCo/v/4QSqhJHAfgOADHQNVSK1WIKHQjUinxa665JivwcMABB8Svf/3r7ES4yZFHHpmdHPeuu+4qdQLcn/3sZzF79uzYc8894+qrr47evXsX8BkAAADV2VYRnAAAALZmBZ3jBAAAUBUITgAAADkEJwAAgByCEwAAQA7BCQAAYGs/jxM1w+rVq+Ott97KSs4nrVq1in322Sfq1atX6KZBpUnv/5deeqnUcZBOvZD+h5rC5wE1nc+CqktwokKtXbs2Ro4cGePGjYtFixaVuq1Zs2YxdOjQuOyyy6J2bZ2fVF/ppN5nnHFGdsLuWrVqxfbbb5+t//zzzyOdEaJ///5x6623RuPGjQvdVKgwPg+o6XwWVH1+O1GhLr744rjtttviV7/6VcyaNSv7pZGWdPmqq67KbhsxYkShmwkV6txzz42XX345Jk+eHMuXL4/58+dnS7r8xBNPZLelbaA683lATeezoOpzAlwqVOp2vvvuu6NXr15l3j5lypQYMGBA9osDqqvtttsu+6A85JBDyrz9hRdeiBNOOCH++c9/VnrboLL4PKCm81lQ9elxokJ9+eWX0aZNm43e3rp16+wvjlDdhyjVr19/o7en29I2UJ35PKCm81lQ9QlOVKg
jjzwyzj///Pj00083uC2tu+iii7JtoDpLf0E8/fTT47XXXtvgtrTurLPOij59+hSkbVBZfB5Q0/ksqPoM1aNCzZ07N3r37h3vvPNO7LffftGyZctsfRqK8de//jWrpPT4449Hu3btCt1UqDBp2MUPfvCDbChSGqrRokWLbP2CBQviiy++yIYu3XfffdG8efNCNxUqjM8DajqfBVWf4ESFS93O6ZfEX/7yl1KlN7t16xY9e/ZUQYkaI31hnDZt2gbHQYcOHQrdNKgUPg/AZ0FVJjgBAADk8KcdgAL75JNPYs6cOYVuBgAF5LNg6yc4UVB777131KlTp9DNgII6+uijY9dddy10M6CgfB5Q0/ks2PrVLXQDqNlGjx69wRnkoaa55557YunSpYVuBhSUzwNqOp8FWz9znAAAAHLocQIogPfeey8by77LLrvEHnvsUejmAFBJ1qxZU2pY6ssvv5xVnOzcuXM0aNCgoG1j08xxokKlXwbpF0SxdI6O7t27R9u2baNr165ZtzTUhCFIU6dOLTmPR48ePWKvvfaKY445Jvv/uOOOy87hAdXZtttuG4MHD44XX3yx0E2Bgvj73/+effdJ4Sj93l+8eHH2OfBv//Zvccghh2TnMnv33XcL3Uw2QXCiQqXzEnz22WfZ5cceeyy+/e1vR/v27eOnP/1p9peV9CH629/+ttDNhAp18803x/bbb59dvvDCC+Pzzz+P6dOnZ2PZZ8yYkYWm888/v9DNhAq1ZMmSeOmll+Kwww7LCkFcd911sXDhwkI3CyrNT37yk2jSpEk8+uij0bRp0+yE0KtXr85ODv3RRx/FnnvuGRdddFGhm8kmmONEhUonM0wneEtnxz788MOzD8z01/diV155ZRao0ongoLpq2LBhzJw5MxuWlyom3X333XHEEUeU3J5CVJ8+feLjjz8uaDuhMj4PUsnl3/zmN3HffffFV199FSeccEKcdtppceyxx0atWrUK3UyoMOm70B//+Mc44IADskIo2223XTz33HPZd6Mk/SEthaniE+Oy9dHjRKVJ3c/f+973Sq076aSTsjNoQ3WWAtObb76ZXU5fDOvWLT29NI11T3+Nh5qgU6dOceONN2Z/KLjrrruyL5ApPH3jG9+IkSNHFrp5UGGWL18ezZo1Kxm6mn73p/+LpV4oVfW2boITFe5vf/tb/M///E80atQom/y4vtRNDdXZkCFD4oILLoj3338/hg4dmg3L++CDD7LbPvzwwxg2bFj07Nmz0M2ECrV+b1Ka59G/f/946qmnsuPh1FNPzYIUVFf77rtvTJgwIbucRh7ssMMOMXHixJLb77///vjmN79ZwBaSx1A9KnxoRvqwLH6bjRkzJs4777yS29MvjMsvvzzeeuutArYSKt5//ud/xvjx42P33XeP2bNnx8qVK7Oep/SHgwMPPDAbstqqVatCNxMqZej2xqTPCsP1qK6mTJkSffv2zf6InI6HdD39Ya158+bZ9VdeeSUbwnryyScXuqlshOBEhVeQWVeaFJn+wlKsuKregAEDKr1tUNnefvvtrLLkrFmzsg/O1q1bx6GHHppV2fNlkerusssuy3peGzduXOimQMGkP5ylea1dunTJimXNnz8/xo0blw3RO/744+Ooo44qdBPZBMEJAAAghzlOFFQappROAgo12apVqxwH1Hg+D6jpHANbP8GJgkpzm1J5ZqjpBVQcB9R0Pg+o6RwDWz/BCQAAIEfpk4lAOUvVwjZl2bJlldYWKBTHATgOwDFQ9QlOVPgQpO9///sb7XpOZ5BPJ8aF6sxxAI4DcAxUfYITFapjx45x8MEHx1lnnVXm7a+//nrcfvvtld4uqEyOA3AcgGOg6jPHiQqVzlEzc+bMjd6+7bbbxhFHHFGpbYLK5jgAxwE4Bqo+53ECAADIoccJAAAghzlOVIqXX345pk2bFvPmzcuut2rVKrp16xYHHXRQoZsGlcZxAI4DcAxUXYbqUaEWLFgQJ510Urz
wwgvxjW98I1q2bJmtnz9/fnZ27DTe9+GHH44WLVoUuqlQYRwH4DgAx0DVZ6geFerss8+ONWvWxNtvvx2zZ8+Ol156KVvS5bRu7dq1cc455xS6mVChHAfgOADHQNWnx4kKlSrEPPfcc9G5c+cyb58+fXoceeSR8eWXX1Z626CyOA7AcQCOgapPjxMVqkGDBrF48eKN3p5+OaRtoDpzHIDjABwDVZ/gRIXq169fDBw4MH7729+W+mWRLqd1gwYNiv79+xe0jVDRHAfgOADHQNWnqh4VasyYMdmY3e9///uxevXqqF+/frZ+5cqVUbdu3Rg8eHBce+21hW4mVCjHATgOwDFQ9ZnjRKVIf01JY3fXLb3ZpUuXaNq0aaGbBpXGcQCOA3AMVF2CEwAAQA5znKhwy5Yti+effz7+9re/bXDb8uXL45577ilIu6AyOQ7AcQCOgapNjxMV6t13342ePXtmJ3arVatWHHbYYXH//fdHmzZtSk76li6n8xpAdeU4AMcBOAaqPj1OVKiLLrooOnbsmJ0te+bMmdk5DNIvivRLA2oKxwE4DsAxUPXpcaJCtWzZMp566qnYb7/9suvp7ZbOnP3EE0/E008/Hdtss42/rlDtOQ7AcQCOgapPjxMVPpY3ldgslrqmb7nllujTp090794967aG6s5xAI4DcAxUfc7jRIXq0KFDvPrqq7H33nuXWn/TTTdl/5944okFahlUHscBOA7AMVD16XGiQn3nO9/JJj6WJf2iSGfINlqU6s5xAI4DcAxUfeY4AQAA5NDjBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAKq0VBy2R48e0atXrw1uu/nmm6N58+bxj3/8oyBtA6D6EJwAqNJq1aoVd955Z7z00ktx6623lqz/8MMP48ILL4wbb7wxdt5553J9zFWrVpXr/QGw9ROcAKjy2rVrFzfccEOcf/75WWBKvVCDBw+Onj17RufOneO4446LJk2aRMuWLeNHP/pRfPrppyX7Pvnkk3HYYYdlPVM77LBDnHDCCfHBBx+U3D579uwsnE2aNCm6d+8eDRs2jHvvvbdAzxSAQnECXACqjb59+8aiRYviu9/9blx++eXx1ltvxb777hunnXZaDBgwIJYtWxYXXXRRrF69Ov785z9n+zz88MNZMNp///3jq6++ipEjR2Zh6fXXX4/atWtnl3fddddo3759XHfddVkQS+GpdevWhX66AFQiwQmAamPBggVZUPr888+zQPTmm2/Gf//3f8eUKVNKtknznVIP1cyZM+Ob3/zmBveReqN22mmn+Otf/xodO3YsCU5jx46Nc889t5KfEQBbC0P1AKg2WrRoEWeccUbsvffeWe/TG2+8EU8//XQ2TK946dChQ7Zt8XC89957L/r37x+77bZbNG3aNOtZSubMmVPqvrt27VqAZwTA1qJuoRsAAOWpbt262ZKkoXd9+vSJq666aoPtiofapdt32WWXuP3226NNmzaxdu3arKdp5cqVpbbfZpttKukZALA1EpwAqLYOPPDAbMhe6kUqDlPr+uyzz7Iheyk0HX744dm6559/vgAtBWBrZ6geANXWOeeck813SkPxXnnllWx4XprvNGjQoFizZk1st912WSW92267Ld5///2sYMTw4cML3WwAtkKCEwDVVhp698ILL2QhKZUm32+//eK8887LSo+ninlpmThxYkyfPj0bnjds2LC45pprCt1sALZCquoBAADk0OMEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAMSm/X8UCvxvBFPq4AAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAI0CAYAAAAqbnmSAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAARM5JREFUeJzt3Qe4FOW9P/AfvaiAioAgii02FBCifyTWoNjFawtRwYaxkKhYIhpFYkFNrBEl4YolkYhRYy9RFBOVXAui0dhiAwuIGsGI9P0/79znnMuBA4N4OMvZ/XyeZ56zOzO7887ssMx33zL1CoVCIQAAAFiq+ktfBAAAQCI4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBFEmnTp3iqKOOKnYxyt748eOjXr162d9inAP77rtvrW93VeaYAKsqwQmghr3zzjvxk5/8JDbaaKNo2rRptGjRInr16hXXXHNNfPPNN8UuXtm6/vrr4+abby52MYiIf/7zn3HBBRfE+++/X+yiACy3hsu/KgB5HnzwwTjkkEOiSZMm0b9//+jcuXPMnTs3nn766TjzzDPjtddei9/97nfFLmbZBqfWrVsvUcu30047ZYG2cePGRStbOQanYcOGxS677JLVMAHUBYITQA1577334kc/+lFssMEG8cQTT8S6665buezkk0+Of/3rX1mwKlULFy7MQmKqZVuVzJo1K5o3b77U5fXr11/lygzAqkdTPYAacvnll8d//vOfuPHGG6uEpgqbbLJJnHLKKUt9/RdffBFnnHFGbL311rH66qtnTfz22muvePnll5dY9ze/+U1stdVWWSBYc801o0ePHjFmzJjK5V999VWceuqp2a/5qfarTZs2sfvuu8fEiROXuQ+p+VTq7/PGG2/EoYcempVh7bXXzso9e/bsKuum9QYNGhS33XZbVpa0nUceeSRb9tJLL2VlT69P+/LDH/4w/v73v1d5fWo2l97jr3/9a9a0MW0nrZ9q6v79739XW2NUsZ327dtnYfTLL7+ssk6qwUi1fC+++GJWk5SOzznnnJMdh1Tb99RTT2XbTFNad1l9nP70pz9F9+7do1mzZllN1RFHHBEfffRRlXVS7VXavzS/b9++2eN11lkn+xwXLFgQy+svf/lLdO3aNQtwW265Zdx9992Vy959992sfFddddUSr3v22WezZX/84x+X+t4V+3fHHXdktTwdOnSINdZYIw4++OCYMWNGzJkzJztX0jmSyn/00Udn8xY1f/78uPDCC2PjjTfOjn86num4Lr5eRf+kVMO63XbbZfuTmqzeeuutVT73VCub7LrrrpWfx+LHf1nvAVAM9QqFQqEoWwYoMeutt152UZn6OC2PdJGZLt4r+t288MILWY1VuqjccMMNY9q0afHb3/42C2OpaVMKC8moUaPi+OOPzy58UxhKgeaVV16J1VZbLetHlRx++OFx5513ZsEmXYh//vnn2YXoYYcdli1bVnBKF9cpvKXy9enTJws8f/jDH+LII4+scvGaLna32GKL+Oyzz7LtpHCxww47RKNGjWL77bfPQtBJJ52UPU/78fHHH2fBJS1L0n6ni/S0rVatWmX7/eabb8YNN9wQP/jBDyov+BctV+/eveOAAw6oXG/bbbeNZ555JttGko5nWpZCSzqWKUS1bds20n91P/3pT7NgcO6552brpvnp+KXtpAv4J598sjJMVZTt+9//fvz4xz/OPot0bNNrUihM5a0ITmPHjs0CRdqvtP7jjz8ed911Vxb0TjzxxNxzIJ0zn376aZxwwglZeLnpppuykJdCaCpfko5H+pzTObKoFB5///vfx9SpU5daq1axfymYpRDYr1+/rPYzhe90PqQatxRUU+
BJn3V6v3Sszz///Mr3SPt5yy23ZOdceq//+Z//yc6FFBb//Oc/V9mfFHRSoD322GOzc3b06NHZMfvHP/6RBd8UBNOxvPbaa7Pwlc6hJO1rOr7L8x4ARZGCEwDfzYwZM9KPUIUDDjhguV+zwQYbFAYMGFD5fPbs2YUFCxZUWee9994rNGnSpPDLX/6ycl7axlZbbbXM927ZsmXh5JNPLnxbQ4cOzfZj//33rzL/pJNOyua//PLLlfPS8/r16xdee+21Kuv27du30Lhx48I777xTOe/jjz8urLHGGoWddtqpct5NN92UvUf37t0Lc+fOrZx/+eWXZ/Pvvffe7Pmnn36avd8ee+xR5fhcd9112XqjR4+unLfzzjtn80aOHLnEvqVjlpYv7sknn8xek/4mqSxt2rQpdO7cufDNN99UrvfAAw9k651//vmV89Lnl+Yt+vkk3bp1y/Zrec6B9Pq77rqryrm07rrrZu9R4be//W223uuvv145L5WzdevWVc6h6lTsX9qfRY9zv379CvXq1SvstddeVdbv2bNnVq4KkyZNyl5/3HHHVVnvjDPOyOY/8cQTS+zPX//618p56fNL5/Dpp59eOe9Pf/pTlWNe3THJew+A2qapHkANmDlzZvY3NYFaUanmIf36n6Qak1RLlGpINttssypN7FJtx4cffhjPP//8Ut8rrZNqBVItz4pINRmLSrU1yUMPPVRl/s4775zVaFVI5U7NzlJNRGpeVSE1XUw1N6nWq+JYVUi1ZxU1RkmqpWnYsGHltlINTuo7lZqTVRyfZODAgVmt1uL9xtJxTLVFKyrV6qQaoFRbtmjfp3322Sc233zzavuppdqiRe24445ZzcrySDUqBx54YOXziuaKqYYl1SQlqdlkKktqFlnh0UcfzWr7UhPC5ZHec9HjnGrIUv495phjqqyX5k+ZMiVrnpdUfA6DBw+ust7pp5+e/V38eKTzIe1/hdR0MZ3Dy3s8auo9AGqa4ARQA9LFbkXfou8yuELqx7LppptmF/+p6Vu6YEzN8FJflAo///nPs0CV+n+kdVPISc3VFu9v9eqrr0bHjh2z9VJTt29z0Zned1GpKVoKLYsPH52aFC5q+vTp2WAM6SJ3calJVtrHdFG+rG2lfUtBq2JbH3zwQfZ38fdMo+ClcFaxvELqw/NdRshb2vaSFJwW314KNOlzWlTqd1ZdP63qpL5vFU0SK3zve9/L/lYcgxSE99tvvyr92FKISvu62267Ldd21l9//SrPW7Zsmf1N58ji89PnVHHOpf1Nn30q56LatWuXlWvx47H4dr7t8aip9wCoaYITQA0Fp1RzkMLKirrkkkuyX/XToAapT1GqUXjssceyPh3pQnbRAJL68dx+++1Z35fUnyb9HTp0aOU6qYYiBaXUjyWV61e/+lX2Pg8//PAKlW3xC/sKqc/Mqqa2y9SgQYNa2U6qMUqfaRoQIgX0++67L+uvtGgt3IqUc2nzF+8CvbRzYEXfb2W/B0BNE5wAakjqXJ8GhpgwYcIKvT4N5pA63qdR+dLABnvssUc2GMLiI8claSCI1LE/DSQwefLkrBnZxRdfXGXku1Rrk5qb3XPPPdlQ6WnUurTO8nj77berPE+DCaTwlnfPnVTzkgYpSMFucWmkvnSRv3gNx+LbSoNhfPLJJ5XbSsO7J4u/Z2q+l/arYnme5b3wX9r2KuYt7/aWVzq2iweCt956K/u76PHec889s+ObaprSgAypZi8N2LGypf1Nn/3in1MaMCOdmytyPJb3swBYlQhOADXkrLPOygLNcccdl11ULi6FqopR75b2K/viF9BpSOzFh8BOfZ8WlZqlpT4h6bXz5s3L+hkt2rQvSaO1pZqnxYePXpoRI0ZUeZ5qrpI0xPiypH1Ige/ee++t0qwvHY/UzCzVjFU0a6yQbgicyl0hjZaX+tdUbCuFx7SPaRS2RY9PCphpP1NoXB7ps6kuhC4uDe2ejtfIkSOrHK9UW/f6668v9/
aWV+qHtujIdKkPWBqxLo2Cl5rDVUj9vlINUxpWPI36l0Yj3GabbWJl23vvvbO/V199dZX5V155ZfZ3RY5H+iyS5fk8AFYVboALUENSP6AUDlJNUGpOl5pWpeGwU81Ial6VQlAa1nlZNVa//OUvs4EN0rDeaejlVLuw6CALSQom6YK6V69e2fDN6WL+uuuuyy5g0+AU6WI0DY2eho7u0qVL1mcoDbCQBpO44oorlmtfUk3O/vvvn9VypBq01HQwDe6Q3i/PRRddlDUxTCEp1XilC/40HHkKIanv1eLS8Un3eUrNC1ONThrGO702bT9JtSxDhgzJhshO5UnzK9ZLw38v7+AI6Z5MKZSl8qX+OikcVdc/KA2gcNlll2WfQxr8IoWViuHIUw3QaaedFjUp9WdKw26nzyd9nmno7bS9VJu4uHROpQCZhk5PZawN6TMfMGBAFnDTuZWOyXPPPZcNT54GAUm1pN9WCoUpZKd9SOE39elLn0X6TABWWbU+jh9AiXvrrbcKAwcOLHTq1CkbRjsNw92rV6/Cb37zm2zI8WUNR56GW05DUTdr1ix7zYQJE7IhtBcdRjsNTZ2G9V577bWzIZo33njjwplnnpkNY53MmTMne96lS5ds26uttlr2+Prrr1/u4cj/+c9/Fg4++ODs9WuuuWZh0KBBVYbmTtJ6SxvyfOLEiYU+ffoUVl999ULz5s0Lu+66a+HZZ5+tsk7FcORPPfVU4fjjj8+2k9Y//PDDC59//vkS75mGH998880LjRo1KrRt27Zw4oknFv79739XWScdp6UN1T516tTCPvvsk+1T2m7FMV18OPIKY8eOzYYET8d4rbXWysr14YcfVlknfX7p+C7tOOZJ50Aq06OPPlrYZpttsm2lfUzDdS9N2r80DPziZVmaiv1b/D0rjv/zzz9fbdmnT59eOW/evHmFYcOGFTbccMPs+Hfs2LEwZMiQKufzovuzuMXP4WTUqFGFjTbaqNCgQYMqx//bvAdAbXIDXAAqVdxoNo2Ol0b1W5kqbjKbalpS8ziWT7du3WKttdaKcePGFbsoAGVFHycAqCPSPaYmTZqUNdkDoHbp4wQAq7g0zP2LL76Y9VFLoyWmfnQA1C41TgCwiktD1admjWn0wT/+8Y/ZTXcBqF36OAEAAORQ4wQAAJBDcAIAAMhRdoNDLFy4MLtLe7pJZL169YpdHAAAoEhSr6Wvvvoq2rdvH/XrL7tOqeyCUwpNHTt2LHYxAACAVcSUKVNivfXWW+Y6ZRecUk1TxcFp0aJFsYsDAAAUycyZM7NKlYqMsCxlF5wqmuel0CQ4AQAA9ZajC4/BIQAAAHIITgAAADkEJwAAgBxl18cJAADqogULFsS8efOKXYw6p3HjxrlDjS8PwQkAAFbxew1NnTo1vvzyy2IXpU5KoWnDDTfMAtR3ITgBAMAqrCI0tWnTJpo3b75cI8DxvxYuXJjdx/WTTz6J9ddf/zsdO8EJAABW4eZ5FaFp7bXXLnZx6qR11lknC0/z58+PRo0arfD7GBwCAABWURV9mlJNEyumooleCqHfheAEAACrOM3zin/sBCcAAIAcghMAALDKuPnmm6NVq1axqjE4BAAA1DGdzn6wVrf3/qX7fOvXHHXUUXHLLbcsMf/tt9+OTTbZJOoawQkAAFgp9txzz7jpppuWGOWuLtJUDwAAWCmaNGkS7dq1qzJdc801sfXWW8dqq60WHTt2jJNOOin+85//LPU9pk+fHj169IgDDzww5syZk92bafjw4dlNbZs1axZdunSJO++8c6Xvi+AEAADUmvr168e1114br732WtaU74knnoizzjqr2nWnTJkSO+64Y3Tu3DkLRymIpdB06623xsiRI7P3OO200+KII46Ip556auWWO4ror3/9a+y3337Rvn37bJjAe+65J/c148ePj2233TY7aKltZOo8BgAArHoeeOCBWH311SunQw45JE499dTYddddo1OnTrHbbrvFRRddFH
fccccSr33zzTejV69e0adPn6y5X4MGDbIap0suuSRGjx6dzd9oo42yvlQpOP32t78t3T5OX3/9dVa1dswxx8R//dd/5a7/3nvvxT777BMnnHBC3HbbbTFu3Lg47rjjYt11180OHAAAsOrYdddd44Ybbqh8nprnPf7441mt0RtvvBEzZ86M+fPnx+zZs2PWrFmVN/r95ptvspqmH//4x3H11VdXvv5f//pXtt7uu+9eZTtz586Nbt26lW5w2muvvbJpeaXquNSW8Yorrsieb7HFFvH000/HVVddJTgBAMAqZrXVVqsygt77778f++67b5x44olx8cUXx1prrZVdzx977LFZ+KkITql1We/evbMaqzPPPDM6dOiQza/oC/Xggw9WzquQXrMy1alR9SZMmJAdwEWlwJSq+5YmVeelqUJKtQAAQO178cUXs8EdUkVI6uuUVNdMLy37/e9/n9U4pVqr1F0nde/Zcssts4A0efLk2HnnnWu17HUqOE2dOjXatm1bZV56nsJQqs5Lo2osLlUDDhs2rGYLckHLmn2/FSrDjCJvv2V5739WhjI/BuW+/1kZyvwYFHv/szKU+TEo9v5nZSjzY1Du+5+VocyPwcre/9U7RvS6IuLTbyIa1otV0scvVT9/1hcRs7+qsnyTFvNj3rx58ZuLfh777b5TPPP8pBg54rr/XfjJKxGz1oj49wcRhQXRYNorcduvzoh+J50Tu+3UK8bf+bto16Z1nPGTI+K0U34WCz9/L36wXdeY0XS9eOaZZ6JFixYxYMCAlbabdSo4rYghQ4bE4MGDK5+nkJWGPQQAgLrq/Z+1/78n7Vdu356a1GWr78WVQwfHZdffHEOGXxc7/b9uMXzIoOh/yvnVrt+wYcP44/WXxGEnnh27HfqTGH/nqLjwrJNinbXXjOHX3RTvTv4wWrVaMxs87pxzzlmpZa9TwSmN+z5t2rQq89LzlC6rq21KUlXeym7vCAAAVHXz1dW3+jrt+COyaVFHHrxv5eOjDts/mxYNT3eN+nWV9U857sfZVJvBsU7dx6lnz57ZSHqLeuyxx7L5AAAAJRmc0qgYkyZNyqaK4cbT49TZq6KZXf/+/SvXT8OQv/vuu9kNstLwhddff33WmSzd9AoAAKAkg9MLL7yQjbdeMeZ66ouUHp9//v+2cfzkk08qQ1SShiJPQw+mWqZ0/6c0Gsd///d/G4ocAAAo3T5Ou+yySxQKhaUuv/nmm6t9zUsvLWXkDgAAgHLv4wQAAFAMghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAHXSUacOjb59+5b+cOQAAMAK+N0utbu9C2asUKi55U/3Z48bNmwY663bJg7Zt3f88owTo2nTJlHXCE4AAMBKseeuO8RNV14Q8+bNjxf/8XoMOHVo1KtXLy4795SoazTVAwAAVoomjRtHuzato2OHdtF3z12j947bxWN//Z9s2cKFC2P4b0bHhv9v32i2cc/o0vuwuPOBxytfu2DBgjj29GGVyzfb8cC45r/HFG1f1DgBAAAr3atv/CuefeGV2KBDu+x5Ck1/uPuhGHnpObHphuvHX/8+MY742S9inbXXjJ17ds+CVWre96ffXh5rr9kynn3h5Tj+rIti3Tat49D996j18gtOAADASvHA43+L1TftFfMXLIg5c+ZG/fr147qLfp49vuQ3o+Px22+Inj26ZOtutMF68fTzk+K3f7grC06NGjWKYWecWPleG67fISa8+Erccf9jghMAAFA6dt2hR9wwfEh8PWt2XDXqtmjYsEEctM8P47U334lZ38yO3fudVGX9ufPmRbfOm1c+H3Hz2Bh9+70x+aOp8c3sOdnyrlttVoQ9EZwAAICVZLXmzWKTDdfPHo++cmh02f1HceMf74nOm22czXvw1mujQ7t1lugXldx+76NxxoVXxxXnnRY9e2wTa6zWPH51w63xPy+9WoQ9EZwAAIBaUL9+/Tjnp8fE4GFXxlt/+3M0adI4Jn
/0SdYsrzrPPD8pdui+TZx01KGV89754MMoFqPqAQAAteKQfXtHg/r1s35MZ/zkyDjtgivjljvuj3fenxIT//F6/Gb07dnzJA0Y8cIrr8ej45+Nt975IM67/Pp4/uV/Fq3sapwAAKCuOX78/z1u3y3qioYNG8agow+Ly6+/Jd77+wPZCHrDr7sp3p38YbRqsUZsu/XmWa1U8pMjDoqXXn0jDjvx7OzeT/0O2DNOGnBIPPzEM8Upe1G2CgAAlLSbrx5W7fyzBx2dTckpx/04m6qTmvLddNWwbFrU8CE/rbqNWgqOmuoBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAKyqCoVF/7ACCjV08AQnAABYRTWa80XEgrkxa16xS1J3zZ07N/vboEGD7/Q+hiMHAIBVVIP5s6LVBw/Hp40PjohW0bxRRL16i600e3YU1fwiV4ctY/8XLlwY06dPj+bNm2f3kPouBCcAAFiFtXt7TPb30w32imjQeMkVvn4viurL6cXdfs7+169fP9Zff/3sJrrfheAEAACrsHpRiHXfvi3avHt3zGu69pJVToNeiKK67pDibj9n/xs3bpyFp+9KcAIAgDqgwYJvosHXHy65oGnTKKr/TCnu9mtp/w0OAQAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAWNWD04gRI6JTp07RtGnT2H777eO5555b5vpXX311bLbZZtGsWbPo2LFjnHbaaTF79uxaKy8AAFB+ihqcxo4dG4MHD46hQ4fGxIkTo0uXLtGnT5/49NNPq11/zJgxcfbZZ2frv/7663HjjTdm73HOOefUetkBAIDyUdTgdOWVV8bAgQPj6KOPji233DJGjhwZzZs3j9GjR1e7/rPPPhu9evWKH//4x1kt1R577BH9+vXLraUCAACok8Fp7ty58eKLL0bv3r3/rzD162fPJ0yYUO1rdthhh+w1FUHp3XffjYceeij23nvvpW5nzpw5MXPmzCoTAADAt9EwiuSzzz6LBQsWRNu2bavMT8/feOONal+TaprS637wgx9EoVCI+fPnxwknnLDMpnrDhw+PYcOG1Xj5AQCA8lH0wSG+jfHjx8cll1wS119/fdYn6u67744HH3wwLrzwwqW+ZsiQITFjxozKacqUKbVaZgAAoO4rWo1T69ato0GDBjFt2rQq89Pzdu3aVfua8847L4488sg47rjjsudbb711fP3113H88cfHueeemzX1W1yTJk2yCQAAoM7VODVu3Di6d+8e48aNq5y3cOHC7HnPnj2rfc2sWbOWCEcpfCWp6R4AAEBJ1TglaSjyAQMGRI8ePWK77bbL7tGUapDSKHtJ//79o0OHDlk/pWS//fbLRuLr1q1bds+nf/3rX1ktVJpfEaAAAABKKjgddthhMX369Dj//PNj6tSp0bVr13jkkUcqB4yYPHlylRqmX/ziF1GvXr3s70cffRTrrLNOFpouvvjiIu4FAABQ6ooanJJBgwZl09IGg1hUw4YNs5vfpgkAAKC21KlR9QAAAIpBcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+
AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHA3zVgAAWJpOs8cUdfvvF3XrQDlR4wQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5GiYtwIA1es0e0xRt/9+UbcOAOVFjRMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADncxwlYIe5hBACUEzVOAAAAOdQ4AcAKUvMKUD4Epzr4H2XiP0sAAKg9muoBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMjhBrjUyZsAuwEwAAC1SXACoE7+gJL4EQWA2qKpHgAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAArOqj6o0YMSJ+9atfxdSpU6NLly7xm9/8Jrbbbrulrv/ll1/GueeeG3fffXd88cUXscEGG8TVV18de++9d62WG4o9opjRxAAAyiQ4jR07NgYPHhwjR46M7bffPgtAffr0iTfffDPatGmzxPpz586N3XffPVt25513RocOHeKDDz6IVq1aFaX8AABAeShqcLryyitj4MCBcfTRR2fPU4B68MEHY/To0XH22WcvsX6an2qZnn322WjUqFE2r1OnTrVebgAAoLwUrY9Tqj168cUXo3fv3v9XmPr1s+cTJkyo9jX33Xdf9OzZM04++eRo27ZtdO7cOS655JJYsGDBUrczZ86cmDlzZpUJAACgTgSnzz77LAs8KQAtKj1P/Z2q8+6772ZN9NLrHnrooTjvvPPiiiuuiIsuumip2xk+fHi0bNmycurYsWON7wsAAFDaVqipXgouN998c4wbNy4+/fTTWLhwYZXlTzzxRKwMaTupf9Pvfve7aNCgQXTv3j0++uijbHCJoUOHVvuaIUOGZP2oKqQaJ+EJAABY6cHplFNOyYLTPvvskzWXq1ev3rd+j9atW2fhZ9q0aVXmp+ft2rWr9jXrrrtu1rcpva7CFltskdVQpaZ/jRs3XuI1TZo0ySYAAIBaDU6333573HHHHd9pCPAUclKNUaq16tu3b2WNUno+aNCgal/Tq1evGDNmTLZe6g+VvPXWW1mgqi40AQAAFK2PUwopm2yyyXfeeGpCN2rUqLjlllvi9ddfjxNPPDG+/vrrylH2+vfvnzW1q5CWp1H1Uo1XCkxpBL40OEQaLAIAAGCVqnE6/fTT45prronrrrtuhZrpVTjssMNi+vTpcf7552fN7bp27RqPPPJI5YARkydPrqxZSlLfpEcffTROO+202GabbbL7OKUQ9fOf/3yFywAAALBSgtPTTz8dTz75ZDz88MOx1VZbVd5TqcLdd9+93O+VmuUtrWne+PHjl5iXhiP/+9//vgKlBgAAqMXg1KpVqzjwwANXcJMAAABlEJxuuummmi8JAABAKQWnCql/0ptvvpk93myzzWKdddapqXIBAADU7VH10sh3xxxzTDYM+E477ZRN7du3j2OPPTZmzZpV86UEAACoa8EpDSP+1FNPxf333x9ffvllNt17773ZvDTiHgAAQJR7U7277ror7rzzzthll10q56Wb4TZr1iwOPfTQuOGGG2qyjAAAAHWvxik1x6u419Ki2rRpo6keAABQclYoOKV7KQ0dOjRmz55dOe+bb76JYcOGZcsAAACi3JvqXXPNNdGnT59Yb731okuXLtm8l19+OZo2bRqPPvpoTZcRAACg7gWnzp07x9tvvx233XZbvPHGG9m8fv36xeGHH571cwIAACglK3wfp+bNm8fAgQNrtjQAAAB1OTjdd999sddee0WjRo2yx8uy//7710TZAAAA6l
Zw6tu3b0ydOjUbOS89Xpp69erFggULaqp8AAAAdSc4LVy4sNrHAAAApW6FhiO/9dZbY86cOUvMnzt3brYMAAAgyj04HX300TFjxowl5n/11VfZMgAAgCj34FQoFLK+TIv78MMPo2XLljVRLgAAgLo5HHm3bt2ywJSmH/7wh9Gw4f+9PA0I8d5778Wee+65MsoJAABQN4JTxWh6kyZNij59+sTqq69euaxx48bRqVOnOOigg2q+lAAAAHUlOA0dOjSrWUoBaY899oh111135ZUMAACgrvZxatCgQfzkJz+J2bNnr5wSAQAAlMLgEJ07d45333235ksDAABQKsHpoosuijPOOCMeeOCB+OSTT2LmzJlVJgAAgLLt41Rh7733zv7uv//+VYYlrximPPWDAgAAKOvg9OSTT9Z8SQAAAEopOO288841XxIAAIBSCk7Jl19+GTfeeGO8/vrr2fOtttoqjjnmmGjZsmVNlg8AAKBuDg7xwgsvxMYbbxxXXXVVfPHFF9l05ZVXZvMmTpxY86UEAACoazVOp512WjYwxKhRo6Jhw/99i/nz58dxxx0Xp556avz1r3+t6XICAADUreCUapwWDU3ZGzVsGGeddVb06NGjJssHAABQN5vqtWjRIiZPnrzE/ClTpsQaa6xRE+UCAACo28HpsMMOi2OPPTbGjh2bhaU03X777VlTvX79+tV8KQEAAOpaU71f//rX2Y1u+/fvn/VtSho1ahQnnnhiXHrppTVdRgAAgLoXnBo3bhzXXHNNDB8+PN55551sXhpRr3nz5jVdPgAAgKJb4fs4JSkotWrVqvIxAABAKVqhPk6ped55552X3ey2U6dO2ZQe/+IXv4h58+bVfCkBAADqWo3TT3/607j77rvj8ssvj549e2bzJkyYEBdccEF8/vnnccMNN9R0OQEAAOpWcBozZkw2it5ee+1VOW+bbbaJjh07ZqPqCU4AAECUe1O9Jk2aZM3zFrfhhhtmA0cAAABEuQenQYMGxYUXXhhz5sypnJceX3zxxdkyAACAKPemei+99FKMGzcu1ltvvejSpUs27+WXX465c+fGD3/4w/iv//qvynVTXygAAICyC05pCPKDDjqoyrzUvwkAAKAUrVBwuummm2q+JAAAAKV4A9zp06fHm2++mT3ebLPNYp111qmpcgEAANTtwSG+/vrrOOaYY2LdddeNnXbaKZvat28fxx57bMyaNavmSwkAAFDXgtPgwYPjqaeeivvvvz++/PLLbLr33nuzeaeffnrNlxIAAKCuNdW766674s4774xddtmlct7ee+8dzZo1i0MPPdQNcAEAgJKyQjVOqTle27Ztl5jfpk0bTfUAAICSs0LBqWfPnjF06NCYPXt25bxvvvkmhg0bli0DAACIcm+qd/XVV8eee+65xA1wmzZtGo8++mhNlxEAAKDuBaett9463n777bjtttvijTfeyOb169cvDj/88KyfEwAAQFkHp3nz5sXmm28eDzzwQAwcOHDllAoAAKAu93Fq1KhRlb5NAAAApW6FBoc4+eST47LLLov58+fXfIkAAABKoY/T888/H+PGjYu//OUvWX+n1VZbrcryu+++u6bKBwAAUDeDU6tWreKggw6q+dIAAADU9eC0cOHC+NWvfhVvvfVWzJ07N3bbbbe44IILjKQHAACUtG/Vx+niiy+Oc845J1ZfffXo0KFDXHvttVl/JwAAgFL2rYLTrbfeGtdff312k9t77rkn7r///uxeTqkmCgAAoFR9q+A0efLk2HvvvSuf9+7dO+rVqxcff/zxyigbAABA3QtOafjxpk2bLnFfp3RTXAAAgFL1rQaHKBQKcdRRR0WTJk0q56Wb4Z5wwglVhiQ3HDkAAFC2wWnAgAFLzDviiCNqsjwAAAB1OzjddNNNK68kAAAApdDHCQAAoBwJTgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQ
QnAACAHIITAABADsEJAACgLgSnESNGRKdOnaJp06ax/fbbx3PPPbdcr7v99tujXr160bdv35VeRgAAoHwVPTiNHTs2Bg8eHEOHDo2JEydGly5dok+fPvHpp58u83Xvv/9+nHHGGbHjjjvWWlkBAIDyVPTgdOWVV8bAgQPj6KOPji233DJGjhwZzZs3j9GjRy/1NQsWLIjDDz88hg0bFhtttFGtlhcAACg/DYu58blz58aLL74YQ4YMqZxXv3796N27d0yYMGGpr/vlL38Zbdq0iWOPPTb+9re/LXMbc+bMyaYKM2fOrKHSAwDlrtPsMUXd/vtF3TqUl6LWOH322WdZ7VHbtm2rzE/Pp06dWu1rnn766bjxxhtj1KhRy7WN4cOHR8uWLSunjh071kjZAQCA8lH0pnrfxldffRVHHnlkFppat269XK9JtVkzZsyonKZMmbLSywkAAJSWojbVS+GnQYMGMW3atCrz0/N27dotsf4777yTDQqx3377Vc5buHBh9rdhw4bx5ptvxsYbb1zlNU2aNMkmAACAOlnj1Lhx4+jevXuMGzeuShBKz3v27LnE+ptvvnn84x//iEmTJlVO+++/f+y6667ZY83wAACAkqtxStJQ5AMGDIgePXrEdtttF1dffXV8/fXX2Sh7Sf/+/aNDhw5ZX6V0n6fOnTtXeX2rVq2yv4vPBwAAKJngdNhhh8X06dPj/PPPzwaE6Nq1azzyyCOVA0ZMnjw5G2kPAACgbINTMmjQoGyqzvjx45f52ptvvnkllQoAAOB/qcoBAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgR8O8FQAAYGk6zR5T1O2/X9StU07UOAEAAORQ4wQAAKywTmVS66jGCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcDfNWAAAAqtdp9phiFyHeL3YByoQaJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAACgLgSnESNGRKdOnaJp06ax/fbbx3PPPbfUdUeNGhU77rhjrLnmmtnUu3fvZa4PAABQ54PT2LFjY/DgwTF06NCYOHFidOnSJfr06ROffvppteuPHz8++vXrF08++WRMmDAhOnbsGHvssUd89NFHtV52AACgPBQ9OF155ZUxcODAOProo2PLLbeMkSNHRvPmzWP06NHVrn/bbbfFSSedFF27do3NN988/vu//zsWLlwY48aNq/WyAwAA5aGowWnu3Lnx4osvZs3tKgtUv372PNUmLY9Zs2bFvHnzYq211qp2+Zw5c2LmzJlVJgAAgDoTnD777LNYsGBBtG3btsr89Hzq1KnL9R4///nPo3379lXC16KGDx8eLVu2rJxS0z4AAIA61VTvu7j00kvj9ttvjz//+c/ZwBLVGTJkSMyYMaNymjJlSq2XEwAAqNsaFnPjrVu3jgYNGsS0adOqzE/P27Vrt8zX/vrXv86C0+OPPx7bbLPNUtdr0qRJNgEAANTJGqfGjRtH9+7dqwzsUDHQQ8+ePZf6ussvvzwuvPDCeOSRR6JHjx61VFoAAK
BcFbXGKUlDkQ8YMCALQNttt11cffXV8fXXX2ej7CX9+/ePDh06ZH2VkssuuyzOP//8GDNmTHbvp4q+UKuvvno2AQAAlFxwOuyww2L69OlZGEohKA0znmqSKgaMmDx5cjbSXoUbbrghG43v4IMPrvI+6T5QF1xwQa2XHwAAKH1FD07JoEGDsmlpN7xd1Pvvv19LpQIAACiBUfUAAABqg+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAAA5BCcAAIAcghMAAEAOwQkAACCH4AQAAJBDcAIAAMghOAEAAOQQnAAAAHIITgAAADkEJwAAgByCEwAAQA7BCQAAIIfgBAAAkENwAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAADqQnAaMWJEdOrUKZo2bRrbb799PPfcc8tc/09/+lNsvvnm2fpbb711PPTQQ7VWVgAAoPwUPTiNHTs2Bg8eHEOHDo2JEydGly5dok+fPvHpp59Wu/6zzz4b/fr1i2OPPTZeeuml6Nu3bza9+uqrtV52AACgPBQ9OF155ZUxcODAOProo2PLLbeMkSNHRvPmzWP06NHVrn/NNdfEnnvuGWeeeWZsscUWceGFF8a2224b1113Xa2XHQAAKA8Ni7nxuXPnxosvvhhDhgypnFe/fv3o3bt3TJgwodrXpPmphmpRqYbqnnvuqXb9OXPmZFOFGTNmZH9nzpy5wuVeOGdWFNt3KX9NKPYxKPb+J+V+DMp9/5NyPwbF3v+k3I9Bsfc/KfdjUO77n5T7MSj2/iflfgxmfof9r3htoVDIX7lQRB999FEqYeHZZ5+tMv/MM88sbLfddtW+plGjRoUxY8ZUmTdixIhCmzZtql1/6NCh2TZMJpPJZDKZTCaTKaqZpkyZkptdilrjVBtSbdaiNVQLFy6ML774ItZee+2oV69eUcqUkm3Hjh1jypQp0aJFiyg35b7/Sbkfg3Lf/6Tcj0G573/iGDgG5b7/Sbkfg3Lf/1XhGKSapq+++irat2+fu25Rg1Pr1q2jQYMGMW3atCrz0/N27dpV+5o0/9us36RJk2xaVKtWrWJVkE6Ocv1HkpT7/iflfgzKff+Tcj8G5b7/iWPgGJT7/iflfgzKff+LfQxatmy56g8O0bhx4+jevXuMGzeuSo1Qet6zZ89qX5PmL7p+8thjjy11fQAAgO+q6E31UjO6AQMGRI8ePWK77baLq6++Or7++utslL2kf//+0aFDhxg+fHj2/JRTTomdd945rrjiithnn33i9ttvjxdeeCF+97vfFXlPAACAUlX04HTYYYfF9OnT4/zzz4+pU6dG165d45FHHom2bdtmyydPnpyNtFdhhx12iDFjxsQvfvGLOOecc2LTTTfNRtTr3Llz1BWp6WC6b9XiTQjLRbnvf1Lux6Dc9z8p92NQ7vufOAaOQbnvf1Lux6Dc97+uHYN6aYSIYhcCAABgVVb0G+ACAACs6gQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACoFbNmTMnmyhv5X4elPv+Q10kONWycvyi/Oc//xknnXRSdOvWLdZdd91sSo/TvLSM0ucc4L
HHHou999471lxzzWjevHk2pcdp3uOPP17s4lFLyv08KPf9h7rOfZxq6YvyqquuigkTJsTMmTOzeS1atIiePXvG4MGDo3fv3lGqHn744ejbt29su+220adPn8obG0+bNi07Li+++GLce++92bJSlsLBddddl50D6UbPSbt27bJzYNCgQbHllltGqXIO/K9yPgduueWWOO644+Lggw9e4hz4y1/+EnfeeWfceOONceSRR0YpK+dzICn386Dc97/CZ599FqNHj17i38EOO+wQRx11VKyzzjpR6sr9u6AunweC00pW7l+UXbp0iQMOOCB++ctfVrv8ggsuiLvvvjteeeWVKFXlHhycA86B733ve3HKKafEySefXO3y66+/Pvtx6e23345SVe7nQFLu50G573/y/PPPZ+d4qmlLPxov+u9g3LhxMWvWrHj00UejR48eUap8F0TdPg9ScGLl2XTTTQvXXXfdUpePGDGisMkmmxRKVdOmTQtvvPHGUpenZWmdUrbNNtsUzjvvvKUuHzp0aGHrrbculCrngHOgSZMmzoEyPweScj8Pyn3/k+23375w/PHHFxYuXLjEsjQvLft//+//FUqZ74JCnT4P9HFaySZPnrzMpng//OEP48MPP4xS1alTp3jwwQeXujwt22CDDaKUvfXWW3H44YcvdXm/fv1K+hdG54BzYKuttspq1pcmNdco9aYp5X4OJOV+HpT7/icvv/xynHbaaVGvXr0llqV5admkSZOilPkuiDp9HjQsdgFKXcUX5eWXX16WX5SpedaPf/zjGD9+fLXVsY888kiMGTMmSllFcNhss83KMjg4B5wDV1xxRey7777ZZ13dOfDuu+8uM1yXgnI/B5JyPw/Kff8r+rA899xzsfnmm1e7PC2rOC6lyndB1OnzQHBaycr9i/KQQw6JDh06xLXXXpsdi8U7QaaL6fS3lJV7cHAOOAd22WWXePXVV+OGG26Iv//971XOgb322itOOOGE7GKilJX7OZCU+3lQ7vufnHHGGXH88cdn/XhSi5vF/x2MGjUqfv3rX0cp810Qdfo8MDhELXj//fer/aJMF4vl8EVJxLPPPpsFh+pG0EmdhUs9OOAcwDkAydixY7NBMNJF84IFC7J5DRo0iO7du2cjDR966KFR6nwXRJ09DwQnAABq1bx587IhqZPWrVtHo0aNil0kimBeHTsPDA5BUZ1zzjlxzDHHFLsYFJFzgAEDBsRuu+1W7GJQZOV+HpTb/qcL5Ioboq/qF8usPI3q2HkgOBVZuX1RLu6jjz7KmjKWs3IPDs4B50DqA1fqnaHzlPs5kJT7eVDu+19xL6ul3fOvXPguiFX6PNBUr8iGDBmStW+96aabil0UiqR///7ZkPRPPPFElJP01VPdUKTlqFzPAar+iDZlyhTnAGUtDRTw3nvvZQNnlSvfBbFKnweCE1AUjRs3zu7lsMUWWxS7KAC14pNPPskGi3r66aezx/Xr14+NNtoo+vbtG0cddVTWOR5YdWmqV2TpV4VSr5L95ptvsv8k/vnPfy6xbPbs2XHrrbdGqXv99dezWsU33ngje57+nnjiidlnX+q/KqXRcaqb0ig6l156aeXzcvL1119n58O5554b1113XXz++edRyiZOnJj9eljh97//ffTq1Ss6duwYP/jBD+L222+PUvfTn/40/va3v0W5S+d7qmGt+MzTuZDuZZju55KaKM2fPz9K1QsvvJD9UPTQQw9lHeLTTU7TCGKrrbZaNjzzTjvtFF999VWxiwksS6pxongmTZpUqF+/fqFUvfnmm4UNNtigUK9evWw/d9ppp8LHH39cuXzq1Kklvf/Jww8/XGjcuHFhrbXWKjRt2jR7vs466xR69+5d2G233QoNGjQojBs3rlCq0mfftWvXwi677FJlSvO///3vZ4933XXXQinbYostCp9//nn2ePLkyYVOnToVWrZsme1/Oi/atGlTePfddwulaptttik89thj2eNRo0YVmj
VrVvjZz35WuOGGGwqnnnpqYfXVVy/ceOONhVJW8R246aabFi699NLCJ598Uig3F154YWGNNdYoHHTQQYV27dplx2HttdcuXHTRRYVLLrkk+148//zzC6WqV69ehQsuuKDy+e9///vC9ttvnz3+4osvsu/J9O+i1M2ZM6cwduzY7N/+j370o2xKj++4445sWblL10XDhg0rlIMpU6YUvvrqqyXmz507t/DUU08VVkWC00p27733LnO66qqrSjo49O3bt7DPPvsUpk+fXnj77bezxxtuuGHhgw8+KJvg1LNnz8K5556bPf7jH/9YWHPNNQvnnHNO5fKzzz67sPvuuxdK1fDhw7PPfPFw2LBhw8Jrr71WKAfponnatGnZ48MPP7ywww47FL788svsefpPI4Xofv36FUpVCkrvv/9+9rhbt26F3/3ud1WW33bbbYUtt9yyUOrnwOOPP1445ZRTCq1bty40atSosP/++xfuv//+woIFCwrlYOONNy7cddddlT8aph+N/vCHP1Quv/vuuwubbLJJEUu48v8dvPPOO5XP0+eezoP0/2Dyl7/8pdC+fftCKUvXARtttFH2I+LOO+9cOPTQQ7MpPU7z0uef1ilnpf6DepJ+QE8/HKb9TN8DRx55ZJUAtSpfGwpOtfQrY/q7tGlVPTlqQvol/ZVXXql8vnDhwsIJJ5xQWH/99bP/QFblfxw1pUWLFpX/EaT/KFNgmDhxYuXyf/zjH4W2bdsWStlzzz1X+N73vlc4/fTTs1+Syjk4pYuGdIG0qGeeeabQsWPHQqlKtQovvPBC5XdCujBY1L/+9a/sorJczoH0byD94t6nT5/soiFdLKcfU0r9gjF9xhU/miUpNLz66quVz1O4bt68eaFUpdYXTz/9dJWLx3RezJo1K3v+3nvvZeGhlKUfiQ444IDCjBkzlliW5qVle+yxR6GUvfzyy8uc0ndDqV8X9e/fP6ttff7557PWCN27dy/06NEjq3lN0rVh+rexKtLHaSVL49LffffdsXDhwmqn1Pa/1Ps3NWzYsPJ5GkUtdYzdb7/9Yuedd4633norykHF6HGpI3DTpk2jZcuWlcvWWGONmDFjRpSy73//+9ndwadPnx49evSIV199texG1KvY39SvL30vLD4McTo2pWqvvfbK/t0n6d/9nXfeWWX5HXfcEZtsskmUi3SvkkMPPTQeeeSRbNSogQMHxm233RabbbZZlLJ27dpV9nVN/XtSP8dF+76+9tpr0aZNmyhVaQCIE044Ifvcn3zyyTj88MOzfw/NmjXLlr/55pvZd0Epe+aZZ+Kiiy6KFi1aLLEszbvwwgtLvi9g165do1u3btnfxac0/0c/+lGUuscffzyuvfba7Hqgd+/e2XmR/l9Mt+f54osvsnVW1WuE/7uiZaVIHT/TBeMBBxxQ7fJ0YpTywIapw29Fh9jFOwgn+++/f5S6Tp06ZRcJG2+8cfZ8woQJsf7661cunzx58hIX0qVo9dVXj1tuuSXrFJ6+KNNFU7kNr5p+RJg5c2Z2gdS5c+fKZR988EGsvfbaUaouu+yybDCIdJGY/qO84oorYvz48dn3QjoWf//73+PPf/5zlKP0XXDBBRfE0KFDs4uJUpaCQhoYIv1/OG7cuDjrrLOyQRHS4Cjp/8KLL744Dj744ChVKTCkkfTSD4fp+69nz57xhz/8oXJ5OgbDhw+PUtaqVavsvn2Lfv8tKi1L65SytdZaKy6//PLs/4TqpB8Q0jlSymbMmBFrrrlm5fMmTZpklQyHHHJI7LrrrlX+XaxqBKeV7Mwzz8xG0Fqa9Ctr+uWpVB144IHxxz/+MY488sgllqXwlGrdRo4cGaUsjZ63aEhY/D+Mhx9+uKxugpx+TUsjqaUfFMrlZo/ponjxELmo+++/P3bccccoVe3bt4+XXnopG0Ux7Wv6sei5557LRhVNgSr92pgCVSlL5/qyhppOF8277757lLJhw4ZltSvpx6NUy3b22WdHly
5dsgA1a9as7GIx1TiUqvTvfuzYsVmtcxo9cPHvgT322CNK3XHHHZeF5/POOy8LDm3bts3mT5s2LQvTKVymEShL/Qf1jz/+eKn//3355Zcl/YN6kobgf+WVV2LTTTeNCumHxT/96U9ZeNp3331jVeU+TgAA1FoN9DXXXBNTp06tbI6VLkVTU85TTz01C9KlLNWupx/UjzjiiGqX//vf/4777rsvuxFuqfr5z38ekyZNikcffXSJZelHhYMOOij7kS39uL6qEZwAAKhV6d5uKTwlKTRtuOGGxS4StWT+/PlZLXN1fd0qln/00UerZKsUg0MAAFCrUlBK/bzSVBGaUvPddGP4clYOx6Bhw4ZLDU1J6guYmvauitQ4AQBQdC+//HJsu+22ZTd40KIcg1ilj4HBIQAAWOlS351lScPzlzrHIOr0MVDjBADASpfuZZh3G5a0fFWsaagpjkHU6WOgjxMAACtdumdhul9PGi2tumnixIlR6hyDqNPHQHACAKBW7mGU7uG3NHm1EKXAMYg6fQz0cQIAYKU788wzs3sYLc0mm2wSTz75ZJQyxyDq9DHQxwkAACCHpnoAAAA5BCcAAIAcghMAAEAOwQkAvoU04tM999xT7GIAUMsEJwDqhKOOOioLLSeccMISy04++eRsWVqnplxwwQXRtWvXGns/AOo2wQmAOqNjx45x++23xzfffFM5b/bs2TFmzJhYf/31i1o2AEqb4ARAnbHttttm4Snddb5CepxCU7du3SrnzZkzJ372s59FmzZtomnTpvGDH/wgnn/++crl48ePz2qoxo0bFz169IjmzZvHDjvsEG+++Wa2/Oabb45hw4bFyy+/nK2XpjSvwmeffRYHHnhg9rpNN9007rvvvlo7BgAUh+AEQJ1yzDHHxE033VT5fPTo0XH00UdXWeess86Ku+66K2655ZaYOHFidkPFPn36xBdffFFlvXPPPTeuuOKKeOGFF6Jhw4bZeyeHHXZYnH766bHVVlvFJ598kk1pXoUUqg499NB45ZVXYu+9947DDz98ifcGoLQITgDUKUcccUQ8/fTT8cEHH2TTM888k82rkO5If8MNN8SvfvWr2GuvvWLLLbeMUaNGRbNmzeLGG2+s8l4XX3xx7Lzzztk6Z599djz77LNZ07+07uqrr56FqXbt2mVTmlch9aXq169fFsguueSS+M9//hPPPfdcrR4HAGpXw1reHgB8J+uss07ss88+WdO5QqGQPW7dunXl8nfeeSfmzZsXvXr1qpzXqFGj2G677eL111+v8l7bbLNN5eN11103+/vpp5/m9pda9HWrrbZatGjRInsdAKVLcAKgzklN6gYNGpQ9HjFixAq/TwpUFVI/pmThwoXf6nUVr12e1wFQd2mqB0Cds+eee8bcuXOzmqXUd2lRG2+8cTRu3DhrwlchrZcGh0hN8pZXeo8FCxbUaLkBqLvUOAFQ5zRo0KCy2V16vKjUdO7EE0+MM888M9Zaa62s2d3ll18es2bNimOPPXa5t9GpU6d47733YtKkSbHeeuvFGmusEU2aNKnxfQGgbhCcAKiTUr+ipbn00kuzpnNHHnlkfPXVV9mQ448++misueaay/3+Bx10UDbU+a677hpffvllNpJfTd5gF4C6pV4h9awFAABgqfRxAgAAyCE4AQAA5BCcAAAAcghOAAAAOQQnAACAHIITAABADsEJAAAgh+AEAACQQ3ACAADIITgBAADkEJwAAAByCE4AAACxbP8fN6fAZF4kx6UAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train_data['year'] = train_data['date_parsed'].dt.year\n", + "train_data['month'] = train_data['date_parsed'].dt.month\n", + "train_data['year_month'] = train_data['date_parsed'].dt.to_period('M')\n", + "train_data['day_of_week'] = train_data['date_parsed'].dt.dayofweek \n", + "\n", + "def plot_date_features(df, column_name, title, xlabel):\n", + " df_plot = df.groupby(column_name)['label'].value_counts(normalize=True).unstack().fillna(0)\n", + " \n", + " if 0 not in df_plot.columns:\n", + " df_plot[0] = 0\n", + " if 1 not in df_plot.columns:\n", + " df_plot[1] = 0\n", + " \n", + " df_plot.plot(kind='bar', stacked=True, figsize=(10, 6))\n", + " plt.title(title)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel('Proportion')\n", + " plt.legend(['Fake', 'Real'])\n", + " plt.show()\n", + "\n", + "plot_date_features(train_data, 'year', 'Class proportion by year', 'Year')\n", + "plot_date_features(train_data, 'month', 'Class proportion by month', 'Month')\n", + "plot_date_features(train_data, 'day_of_week', 'Class proportion by day of the week', 'Day of the week (Mon=0, Sun=6)')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Aggregate data by month and calculate normalized proportions\n", + "monthly_proportions = train_data.groupby('year_month')['label'].value_counts(normalize=True).unstack().fillna(0)\n", + "\n", + "# Build a line chart\n", + "plt.figure(figsize=(15, 7))\n", + "plt.plot(monthly_proportions.index.astype(str), monthly_proportions[0], label='Fake News', marker='o')\n", + "plt.plot(monthly_proportions.index.astype(str), monthly_proportions[1], label='Real News', marker='o')\n", + "\n", + "plt.title('Proportion of fake and real news by month')\n", + "plt.xlabel('Year and month')\n", + "plt.ylabel('Proportion of total news for the month')\n", + "plt.xticks(rotation=45)\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "numerical_features = [\n", + " 'label', 'text_length', 'title_length', 'word_count', \n", + " 'title_word_count', 'year', 'month', 'day_of_week'\n", + "]\n", + "\n", + "corr_data = train_data[numerical_features].copy()\n", + "\n", + "\n", + "corr_data = corr_data.fillna(0)\n", + "\n", + "\n", + "correlation_matrix = corr_data.corr()\n", + "\n", + "\n", + "plt.figure(figsize=(10, 8))\n", + "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)\n", + "plt.title('Corr matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Key Insights and Recommendations" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== KEY INSIGHTS ===\n", + "\n", + "🔍 DATA CHARACTERISTICS:\n", + "• The dataset is well-balanced with roughly equal fake and real news\n", + "• Text lengths vary significantly, indicating diverse article types\n", + "• Multiple subjects are represented, providing good coverage\n", + "• Date range spans multiple years, allowing temporal analysis\n", + "\n", + "📊 DISTRIBUTION PATTERNS:\n", + "• Fake and real news have similar length distributions\n", + "• Subject distribution shows some bias toward certain topics\n", + "• Word frequency analysis reveals common terms in both categories\n", + "\n", + "⚠️ POTENTIAL CHALLENGES:\n", + "• Some articles may have inconsistent formatting\n", + "• Date parsing may require additional cleaning\n", + "• Text preprocessing will be crucial for model performance\n", + "\n", + "💡 RECOMMENDATIONS:\n", + "• Use text preprocessing to clean and normalize content\n", + "• Consider feature engineering (length, word count, etc.)\n", + "• Implement cross-validation for robust model evaluation\n", + "• Use ensemble methods to improve classification accuracy\n", + "• 
Consider temporal features if date parsing is successful\n" + ] + } + ], + "source": [ + "# Key insights\n", + "print(\"=== KEY INSIGHTS ===\")\n", + "\n", + "print(\"\\n🔍 DATA CHARACTERISTICS:\")\n", + "print(\"• The dataset is well-balanced with roughly equal fake and real news\")\n", + "print(\"• Text lengths vary significantly, indicating diverse article types\")\n", + "print(\"• Multiple subjects are represented, providing good coverage\")\n", + "print(\"• Date range spans multiple years, allowing temporal analysis\")\n", + "\n", + "print(\"\\n📊 DISTRIBUTION PATTERNS:\")\n", + "print(\"• Fake and real news have similar length distributions\")\n", + "print(\"• Subject distribution shows some bias toward certain topics\")\n", + "print(\"• Word frequency analysis reveals common terms in both categories\")\n", + "\n", + "print(\"\\n⚠️ POTENTIAL CHALLENGES:\")\n", + "print(\"• Some articles may have inconsistent formatting\")\n", + "print(\"• Date parsing may require additional cleaning\")\n", + "print(\"• Text preprocessing will be crucial for model performance\")\n", + "\n", + "print(\"\\n💡 RECOMMENDATIONS:\")\n", + "print(\"• Use text preprocessing to clean and normalize content\")\n", + "print(\"• Consider feature engineering (length, word count, etc.)\")\n", + "print(\"• Implement cross-validation for robust model evaluation\")\n", + "print(\"• Use ensemble methods to improve classification accuracy\")\n", + "print(\"• Consider temporal features if date parsing is successful\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
labeltitletextsubjectdatedate_cleandate_parsedtext_lengthtitle_lengthword_counttitle_word_countyearmonthyear_monthday_of_week
01As U.S. budget fight looms, Republicans flip t...WASHINGTON (Reuters) - The head of a conservat...politicsNewsDecember 31, 2017December 31, 20172017-12-31465964749102017.012.02017-126.0
11U.S. military to accept transgender recruits o...WASHINGTON (Reuters) - Transgender people will...politicsNewsDecember 29, 2017December 29, 20172017-12-2940776462492017.012.02017-124.0
21Senior U.S. Republican senator: 'Let Mr. Muell...WASHINGTON (Reuters) - The special counsel inv...politicsNewsDecember 31, 2017December 31, 20172017-12-31278960457102017.012.02017-126.0
31FBI Russia probe helped by Australian diplomat...WASHINGTON (Reuters) - Trump campaign adviser ...politicsNewsDecember 30, 2017December 30, 20172017-12-3024615937692017.012.02017-125.0
41Trump wants Postal Service to charge 'much mor...SEATTLE/WASHINGTON (Reuters) - President Donal...politicsNewsDecember 29, 2017December 29, 20172017-12-29520469852112017.012.02017-124.0
................................................
399370THIS IS NOT A JOKE! Soros-Linked Group Has Pla...The Left has been organizing for decades, and ...left-newsSep 22, 2016Sep 22, 20162016-09-225026124841202016.09.02016-093.0
399380THE SMARTEST WOMAN In Politics: “How Trump Can...Monica Crowley offers some of the most brillia...left-newsSep 22, 2016Sep 22, 20162016-09-22465185791152016.09.02016-093.0
399390BREAKING! SHOCKING VIDEO FROM CHARLOTTE RIOTS:...Protest underway in Charlotte: Things got com...left-newsSep 21, 2016Sep 21, 20162016-09-211898126122016.09.02016-092.0
399400BREAKING! Charlotte News Station Reports Cops ...Local Charlotte, NC news station WSOCTV is rep...left-newsSep 21, 2016Sep 21, 20162016-09-212302110380182016.09.02016-092.0
399410BIG MISTAKE! HILLARY JUST Proved To America Sh...Dividing America will be Obama s legacy. Hilla...left-newsSep 21, 2016Sep 21, 20162016-09-21989102171162016.09.02016-092.0
\n", + "

39942 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " label title \\\n", + "0 1 As U.S. budget fight looms, Republicans flip t... \n", + "1 1 U.S. military to accept transgender recruits o... \n", + "2 1 Senior U.S. Republican senator: 'Let Mr. Muell... \n", + "3 1 FBI Russia probe helped by Australian diplomat... \n", + "4 1 Trump wants Postal Service to charge 'much mor... \n", + "... ... ... \n", + "39937 0 THIS IS NOT A JOKE! Soros-Linked Group Has Pla... \n", + "39938 0 THE SMARTEST WOMAN In Politics: “How Trump Can... \n", + "39939 0 BREAKING! SHOCKING VIDEO FROM CHARLOTTE RIOTS:... \n", + "39940 0 BREAKING! Charlotte News Station Reports Cops ... \n", + "39941 0 BIG MISTAKE! HILLARY JUST Proved To America Sh... \n", + "\n", + " text subject \\\n", + "0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n", + "1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n", + "2 WASHINGTON (Reuters) - The special counsel inv... politicsNews \n", + "3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews \n", + "4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews \n", + "... ... ... \n", + "39937 The Left has been organizing for decades, and ... left-news \n", + "39938 Monica Crowley offers some of the most brillia... left-news \n", + "39939 Protest underway in Charlotte: Things got com... left-news \n", + "39940 Local Charlotte, NC news station WSOCTV is rep... left-news \n", + "39941 Dividing America will be Obama s legacy. Hilla... left-news \n", + "\n", + " date date_clean date_parsed text_length \\\n", + "0 December 31, 2017 December 31, 2017 2017-12-31 4659 \n", + "1 December 29, 2017 December 29, 2017 2017-12-29 4077 \n", + "2 December 31, 2017 December 31, 2017 2017-12-31 2789 \n", + "3 December 30, 2017 December 30, 2017 2017-12-30 2461 \n", + "4 December 29, 2017 December 29, 2017 2017-12-29 5204 \n", + "... ... ... ... ... 
\n", + "39937 Sep 22, 2016 Sep 22, 2016 2016-09-22 5026 \n", + "39938 Sep 22, 2016 Sep 22, 2016 2016-09-22 4651 \n", + "39939 Sep 21, 2016 Sep 21, 2016 2016-09-21 189 \n", + "39940 Sep 21, 2016 Sep 21, 2016 2016-09-21 2302 \n", + "39941 Sep 21, 2016 Sep 21, 2016 2016-09-21 989 \n", + "\n", + " title_length word_count title_word_count year month year_month \\\n", + "0 64 749 10 2017.0 12.0 2017-12 \n", + "1 64 624 9 2017.0 12.0 2017-12 \n", + "2 60 457 10 2017.0 12.0 2017-12 \n", + "3 59 376 9 2017.0 12.0 2017-12 \n", + "4 69 852 11 2017.0 12.0 2017-12 \n", + "... ... ... ... ... ... ... \n", + "39937 124 841 20 2016.0 9.0 2016-09 \n", + "39938 85 791 15 2016.0 9.0 2016-09 \n", + "39939 81 26 12 2016.0 9.0 2016-09 \n", + "39940 110 380 18 2016.0 9.0 2016-09 \n", + "39941 102 171 16 2016.0 9.0 2016-09 \n", + "\n", + " day_of_week \n", + "0 6.0 \n", + "1 4.0 \n", + "2 6.0 \n", + "3 5.0 \n", + "4 4.0 \n", + "... ... \n", + "39937 3.0 \n", + "39938 3.0 \n", + "39939 2.0 \n", + "39940 2.0 \n", + "39941 2.0 \n", + "\n", + "[39942 rows x 15 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2.1 enhanced_classifier.ipynb b/2.1 enhanced_classifier.ipynb new file mode 100644 index 0000000..7641825 --- /dev/null +++ b/2.1 enhanced_classifier.ipynb @@ -0,0 +1,728 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Enhanced 
Fake News Classifier\n", + "\n", + "## Overview\n", + "This notebook implements an enhanced fake news classifier with:\n", + "- Robust text preprocessing (cleaning, stopword removal, lemmatization)\n", + "- Advanced feature engineering (length, punctuation, lexical diversity, subject, dates)\n", + "- TF-IDF vectorization and numerical feature scaling\n", + "- Multiple models (Logistic Regression, Random Forest, Gradient Boosting, SVM)\n", + "- Soft-voting ensemble and cross-validation\n", + "- Predictions on validation set with confidence scores\n", + "\n", + "Run cells top-to-bottom.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setup complete\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /Users/sergej/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + "# Imports and setup\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Text processing\n", + "import re\n", + "from collections import Counter\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# ML\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import classification_report, accuracy_score\n", + "from sklearn.preprocessing import StandardScaler\n", + "from scipy.sparse 
import hstack\n", + "\n", + "# NLTK data\n", + "for pkg, path in [('punkt','tokenizers/punkt'),('stopwords','corpora/stopwords'),('wordnet','corpora/wordnet')]:\n", + " try:\n", + " nltk.data.find(path)\n", + " except LookupError:\n", + " nltk.download(pkg)\n", + "\n", + "print('Setup complete')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data...\n", + "Training: (39942, 5), Validation: (4956, 5)\n" + ] + } + ], + "source": [ + "# Load data\n", + "print('Loading data...')\n", + "train_data = pd.read_csv('dataset/data.csv')\n", + "validation_data = pd.read_csv('dataset/validation_data.csv')\n", + "print(f'Training: {train_data.shape}, Validation: {validation_data.shape}')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preprocessing functions ready\n" + ] + } + ], + "source": [ + "# Preprocessing functions\n", + "def clean_text(text):\n", + " if pd.isna(text):\n", + " return ''\n", + " text = text.lower()\n", + " text = re.sub(r'[^a-zA-Z\\s]', '', text)\n", + " text = ' '.join(text.split())\n", + " return text\n", + "\n", + "def remove_stopwords(text):\n", + " if pd.isna(text):\n", + " return ''\n", + " try:\n", + " stop_words = set(stopwords.words('english'))\n", + " words = word_tokenize(text)\n", + " return ' '.join([w for w in words if w.lower() not in stop_words])\n", + " except:\n", + " return text\n", + "\n", + "def lemmatize_text(text):\n", + " if pd.isna(text):\n", + " return ''\n", + " try:\n", + " lemmatizer = WordNetLemmatizer()\n", + " words = word_tokenize(text)\n", + " return ' '.join([lemmatizer.lemmatize(w) for w in words])\n", + " except:\n", + " return text\n", + "\n", + "def preprocess_text(text):\n", + " text = clean_text(text)\n", + " text = remove_stopwords(text)\n", + " text = lemmatize_text(text)\n", + " 
return text\n", + "\n", + "print('Preprocessing functions ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Date parser ready\n" + ] + } + ], + "source": [ + "# Advanced date parsing\n", + "\n", + "def parse_dates_robust(date_series):\n", + " formats = ['%B %d, %Y', '%d-%b-%y', '%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y']\n", + " parsed_dates = pd.Series([pd.NaT] * len(date_series), index=date_series.index)\n", + " for fmt in formats:\n", + " try:\n", + " temp = pd.to_datetime(date_series, format=fmt, errors='coerce')\n", + " mask = parsed_dates.isna() & temp.notna()\n", + " parsed_dates[mask] = temp[mask]\n", + " except:\n", + " continue\n", + " return parsed_dates\n", + "\n", + "print('Date parser ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Engineering features...\n", + "Features ready\n" + ] + } + ], + "source": [ + "# Feature engineering\n", + "print('Engineering features...')\n", + "\n", + "# Process text\n", + "data = train_data.copy()\n", + "data['text_processed'] = data['text'].apply(preprocess_text)\n", + "data['title_processed'] = data['title'].apply(preprocess_text)\n", + "\n", + "# Basic lengths\n", + "data['text_length'] = data['text'].str.len()\n", + "data['title_length'] = data['title'].str.len()\n", + "data['word_count'] = data['text'].str.split().str.len()\n", + "data['title_word_count'] = data['title'].str.split().str.len()\n", + "\n", + "# Advanced text features\n", + "data['avg_word_length'] = data['text'].str.split().str.join(' ').str.len() / data['word_count']\n", + "data['title_avg_word_length'] = data['title'].str.split().str.join(' ').str.len() / data['title_word_count']\n", + "\n", + "# Punctuation features (escape regex)\n", + "data['exclamation_count'] = data['text'].str.count(r'!')\n", + 
"data['question_count'] = data['text'].str.count(r'\\?')\n", + "data['quote_count'] = data['text'].str.count(r'\"')\n", + "data['capital_ratio'] = data['text'].str.count(r'[A-Z]') / data['text_length']\n", + "\n", + "# Lexical diversity\n", + "data['unique_words_ratio'] = data['text_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "data['title_unique_words_ratio'] = data['title_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "\n", + "# Subject encoding\n", + "from sklearn.preprocessing import LabelEncoder\n", + "le = LabelEncoder()\n", + "data['subject_encoded'] = le.fit_transform(data['subject'])\n", + "subject_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n", + "DEFAULT_SUBJECT = 0\n", + "\n", + "# Date features\n", + "data['date_clean'] = data['date'].str.strip()\n", + "data['date_parsed'] = parse_dates_robust(data['date_clean'])\n", + "data['year'] = data['date_parsed'].dt.year\n", + "data['month'] = data['date_parsed'].dt.month\n", + "data['day_of_week'] = data['date_parsed'].dt.dayofweek\n", + "data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)\n", + "\n", + "numerical_features = [\n", + " 'text_length','title_length','word_count','title_word_count',\n", + " 'avg_word_length','title_avg_word_length',\n", + " 'exclamation_count','question_count','quote_count','capital_ratio',\n", + " 'unique_words_ratio','title_unique_words_ratio',\n", + " 'subject_encoded','year','month','day_of_week','is_weekend'\n", + "]\n", + "\n", + "print('Features ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vectorization and scaling complete\n" + ] + } + ], + "source": [ + "# Train/test split and vectorization\n", + "X_text = data['text_processed']\n", + "X_num = data[numerical_features].fillna(0)\n", + "y = data['label']\n", + "\n", + "X_train_text, 
X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(\n", + " X_text, X_num, y, test_size=0.2, random_state=42, stratify=y\n", + ")\n", + "\n", + "# TF-IDF\n", + "tfidf_vectorizer = TfidfVectorizer(\n", + " max_features=8000,\n", + " ngram_range=(1,3),\n", + " min_df=3,\n", + " max_df=0.8,\n", + " sublinear_tf=True\n", + ")\n", + "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)\n", + "X_test_tfidf = tfidf_vectorizer.transform(X_test_text)\n", + "\n", + "# Scale numerical features\n", + "scaler = StandardScaler()\n", + "X_train_num_scaled = scaler.fit_transform(X_train_num)\n", + "X_test_num_scaled = scaler.transform(X_test_num)\n", + "\n", + "# Combine\n", + "from scipy.sparse import hstack\n", + "X_train_combined = hstack([X_train_tfidf, X_train_num_scaled])\n", + "X_test_combined = hstack([X_test_tfidf, X_test_num_scaled])\n", + "\n", + "print('Vectorization and scaling complete')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Logistic Regression: 0.9999\n", + "Random Forest: 1.0000\n", + "Gradient Boosting: 1.0000\n", + "SVM: 0.9996\n", + "Ensemble: 0.9999\n", + "\n", + "Best model: Random Forest (1.0000)\n" + ] + } + ], + "source": [ + "# Train models\n", + "lr_model = LogisticRegression(random_state=42, max_iter=2000, C=1.0)\n", + "rf_model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)\n", + "gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)\n", + "svm_model = SVC(random_state=42, probability=True, kernel='rbf')\n", + "ensemble_model = VotingClassifier(\n", + " estimators=[('lr', lr_model), ('rf', rf_model), ('gb', gb_model), ('svm', svm_model)],\n", + " voting='soft'\n", + ")\n", + "\n", + "models = {\n", + " 'Logistic Regression': lr_model,\n", + " 'Random Forest': rf_model,\n", + " 'Gradient Boosting': gb_model,\n", + " 'SVM': svm_model,\n", + " 'Ensemble': 
ensemble_model\n", + "}\n", + "\n", + "results = {}\n", + "for name, model in models.items():\n", + " model.fit(X_train_combined, y_train)\n", + " y_pred = model.predict(X_test_combined)\n", + " acc = accuracy_score(y_test, y_pred)\n", + " results[name] = acc\n", + " print(f\"{name}: {acc:.4f}\")\n", + "\n", + "best_model_name = max(results, key=results.get)\n", + "best_model = models[best_model_name]\n", + "print(f\"\\nBest model: {best_model_name} ({results[best_model_name]:.4f})\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV scores: [0.99984353 0.99953059 0.99968706 0.99968701 0.99968701]\n", + "CV mean: 0.9996870403386608 +/- 0.0001979206855330111\n" + ] + } + ], + "source": [ + "# Cross-validation\n", + "cv_scores = cross_val_score(best_model, X_train_combined, y_train, cv=5)\n", + "print('CV scores:', cv_scores)\n", + "print('CV mean:', cv_scores.mean(), '+/-', cv_scores.std()*2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9998748278883465\n", + " precision recall f1-score support\n", + "\n", + " Fake 1.00 1.00 1.00 3989\n", + " Real 1.00 1.00 1.00 4000\n", + "\n", + " accuracy 1.00 7989\n", + " macro avg 1.00 1.00 1.00 7989\n", + "weighted avg 1.00 1.00 1.00 7989\n", + "\n" + ] + } + ], + "source": [ + "print('Accuracy:', accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred, target_names=['Fake','Real']))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9998748278883465\n", + " precision recall f1-score support\n", + "\n", + " Fake 1.00 1.00 1.00 3989\n", + " Real 1.00 1.00 1.00 4000\n", + "\n", + " accuracy 1.00 7989\n", + " macro avg 1.00 1.00 1.00 7989\n", 
+ "weighted avg 1.00 1.00 1.00 7989\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Confusion matrix\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake', 'Real'])\n", + "disp.plot(cmap=plt.cm.Blues)\n", + "plt.title('Confusion Matrix')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top text features:\n", + "like: 0.0081\n", + "know: 0.0092\n", + "minister: 0.0101\n", + "washington: 0.0114\n", + "image: 0.0173\n", + "image via: 0.0177\n", + "washington reuters: 0.0203\n", + "via: 0.0279\n", + "said: 0.0375\n", + "reuters: 0.1008\n", + "\n", + "Numerical features:\n", + "text_length: 0.0067\n", + "title_length: 0.0499\n", + "word_count: 0.0049\n", + "title_word_count: 0.0377\n", + "avg_word_length: 0.0096\n", + "title_avg_word_length: 0.0003\n", + "exclamation_count: 0.0145\n", + "question_count: 0.0145\n", + "quote_count: 0.0002\n", + "capital_ratio: 0.0016\n", + "unique_words_ratio: 0.0015\n", + "title_unique_words_ratio: 0.0013\n", + "subject_encoded: 0.0980\n", + "year: 0.0314\n", + "month: 0.0280\n", + "day_of_week: 0.0251\n", + "is_weekend: 0.0002\n" + ] + } + ], + "source": [ + "# Feature importance (if available)\n", + "if hasattr(best_model, 'feature_importances_'):\n", + " importances = best_model.feature_importances_\n", + " text_size = X_train_tfidf.shape[1]\n", + " text_imps = importances[:text_size]\n", + " top_idx = np.argsort(text_imps)[-10:]\n", + " top_feats = [tfidf_vectorizer.get_feature_names_out()[i] for i in top_idx]\n", + " print('Top text features:')\n", + " for f, w in zip(top_feats, text_imps[top_idx]):\n", + " print(f'{f}: {w:.4f}')\n", + " num_imps = 
importances[text_size:]\n", + " print('\\nNumerical features:')\n", + " for f, w in zip(numerical_features, num_imps):\n", + " print(f'{f}: {w:.4f}')\n", + "else:\n", + " print('Model has no feature_importances_ attribute')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlabelprobability_fakeprobability_realconfidence
0010.0715430.9284570.928457
1110.1784410.8215590.821559
2210.1284390.8715610.871561
3310.0538860.9461140.946114
4410.0772230.9227770.922777
\n", + "
" + ], + "text/plain": [ + " id label probability_fake probability_real confidence\n", + "0 0 1 0.071543 0.928457 0.928457\n", + "1 1 1 0.178441 0.821559 0.821559\n", + "2 2 1 0.128439 0.871561 0.871561\n", + "3 3 1 0.053886 0.946114 0.946114\n", + "4 4 1 0.077223 0.922777 0.922777" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict on validation data\n", + "val = validation_data.copy()\n", + "val['text_processed'] = val['text'].apply(preprocess_text)\n", + "val['title_processed'] = val['title'].apply(preprocess_text)\n", + "val['text_length'] = val['text'].str.len()\n", + "val['title_length'] = val['title'].str.len()\n", + "val['word_count'] = val['text'].str.split().str.len()\n", + "val['title_word_count'] = val['title'].str.split().str.len()\n", + "val['avg_word_length'] = val['text'].str.split().str.join(' ').str.len() / val['word_count']\n", + "val['title_avg_word_length'] = val['title'].str.split().str.join(' ').str.len() / val['title_word_count']\n", + "val['exclamation_count'] = val['text'].str.count(r'!')\n", + "val['question_count'] = val['text'].str.count(r'\\?')\n", + "val['quote_count'] = val['text'].str.count(r'\"')\n", + "val['capital_ratio'] = val['text'].str.count(r'[A-Z]') / val['text_length']\n", + "val['unique_words_ratio'] = val['text_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "val['title_unique_words_ratio'] = val['title_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "\n", + "# Subject encoding with unseen handling\n", + "val['subject_encoded'] = val['subject'].map(subject_mapping).fillna(DEFAULT_SUBJECT).astype(int)\n", + "\n", + "# Date features\n", + "val['date_clean'] = val['date'].str.strip()\n", + "val['date_parsed'] = parse_dates_robust(val['date_clean'])\n", + "val['year'] = val['date_parsed'].dt.year\n", + "val['month'] = val['date_parsed'].dt.month\n", + "val['day_of_week'] = 
val['date_parsed'].dt.dayofweek\n", + "val['is_weekend'] = val['day_of_week'].isin([5, 6]).astype(int)\n", + "\n", + "X_val_text = val['text_processed']\n", + "X_val_num = val[numerical_features].fillna(0)\n", + "X_val_num_scaled = scaler.transform(X_val_num)\n", + "X_val_tfidf = tfidf_vectorizer.transform(X_val_text)\n", + "X_val_combined = hstack([X_val_tfidf, X_val_num_scaled])\n", + "\n", + "val_pred = best_model.predict(X_val_combined)\n", + "val_proba = best_model.predict_proba(X_val_combined)\n", + "\n", + "submission = pd.DataFrame({\n", + " 'id': range(len(val)),\n", + " 'label': val_pred,\n", + " 'probability_fake': val_proba[:, 0],\n", + " 'probability_real': val_proba[:, 1],\n", + " 'confidence': np.max(val_proba, axis=1)\n", + "})\n", + "\n", + "submission.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved to enhanced_fake_news_predictions.csv\n", + "Distribution:\n", + "label\n", + "fake 0.702986\n", + "real 0.297014\n", + "Name: proportion, dtype: float64\n" + ] + } + ], + "source": [ + "# Save predictions\n", + "submission.to_csv('enhanced_fake_news_predictions.csv', index=False)\n", + "print(\"Saved to enhanced_fake_news_predictions.csv\")\n", + "print('Distribution:')\n", + "print(submission['label'].value_counts(normalize=True).rename({0:'fake',1:'real'}))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved labeled validation to: dataset/validation_data_labeled.csv\n" + ] + } + ], + "source": [ + "# Save validation with predicted labels into dataset\n", + "import os\n", + "\n", + "# Try to use in-memory validation data if present; otherwise read from disk\n", + "try:\n", + " original_val = validation_data.copy()\n", + "except NameError:\n", + " original_val = pd.read_csv('dataset/validation_data.csv')\n", + 
"\n", + "# Ensure lengths match\n", + "assert len(original_val) == len(submission), 'Length mismatch between validation and predictions'\n", + "\n", + "labeled_val = original_val.copy()\n", + "labeled_val['label'] = submission['label'].values\n", + "\n", + "out_path = os.path.join('dataset', 'validation_data_labeled.csv')\n", + "labeled_val.to_csv(out_path, index=False)\n", + "print(f\"Saved labeled validation to: {out_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2.2 embeddings_classifier.ipynb b/2.2 embeddings_classifier.ipynb new file mode 100644 index 0000000..e6df594 --- /dev/null +++ b/2.2 embeddings_classifier.ipynb @@ -0,0 +1,607 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings-based Fake News Classifier\n", + "\n", + "## Overview\n", + "This notebook builds a fake news classifier using sentence embeddings (Sentence-Transformers) and a simple linear classifier. Its structure is similar to `2.1 enhanced_classifier.ipynb`, but without complex text preprocessing.\n", + "\n", + "Pipeline:\n", + "- Installation and import of libraries \n", + "- Data loading \n", + "- Input text formation (title + text) \n", + "- Embedding generation (all-MiniLM-L6-v2) \n", + "- Train/test split \n", + "- Logistic Regression training \n", + "- Quality evaluation \n", + "- Prediction generation for validation and saving to CSV"
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sergej/.pyenv/versions/3.11.8/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Environment ready\n" + ] + } + ], + "source": [ + "# Setup and imports\n", + "import sys\n", + "!{sys.executable} -m pip install -q sentence-transformers\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, accuracy_score\n", + "\n", + "print('Environment ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(39942, 5) (4956, 5)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labeltitletextsubjectdate
01As U.S. budget fight looms, Republicans flip t...WASHINGTON (Reuters) - The head of a conservat...politicsNewsDecember 31, 2017
11U.S. military to accept transgender recruits o...WASHINGTON (Reuters) - Transgender people will...politicsNewsDecember 29, 2017
\n", + "
" + ], + "text/plain": [ + " label title \\\n", + "0 1 As U.S. budget fight looms, Republicans flip t... \n", + "1 1 U.S. military to accept transgender recruits o... \n", + "\n", + " text subject \\\n", + "0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n", + "1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n", + "\n", + " date \n", + "0 December 31, 2017 \n", + "1 December 29, 2017 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data\n", + "train_df = pd.read_csv('dataset/data.csv')\n", + "val_df = pd.read_csv('dataset/validation_data.csv')\n", + "print(train_df.shape, val_df.shape)\n", + "train_df.head(2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Samples: 39942 Labels: (39942,)\n", + "As U.S. budget fight looms, Republicans flip their fiscal script WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. 
Congress, who voted this month for a huge expansion of\n" + ] + } + ], + "source": [ + "# Build input texts (title + text)\n", + "train_texts = (train_df['title'].fillna('') + ' ' + train_df['text'].fillna('')).tolist()\n", + "val_texts = (val_df['title'].fillna('') + ' ' + val_df['text'].fillna('')).tolist()\n", + "y = train_df['label'].values\n", + "\n", + "print('Samples:', len(train_texts), 'Labels:', y.shape)\n", + "print(train_texts[0][:200])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 125/125 [04:39<00:00, 2.24s/it]\n", + "Batches: 100%|██████████| 32/32 [01:22<00:00, 2.59s/it]\n", + "Batches: 100%|██████████| 20/20 [00:55<00:00, 2.75s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "((31953, 384), (7989, 384), (4956, 384))" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Encode embeddings\n", + "embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", + "\n", + "# Train/test split indices\n", + "idx = np.arange(len(train_texts))\n", + "train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42, stratify=y)\n", + "\n", + "X_train = embedder.encode([train_texts[i] for i in train_idx], batch_size=256, convert_to_numpy=True, show_progress_bar=True)\n", + "X_test = embedder.encode([train_texts[i] for i in test_idx], batch_size=256, convert_to_numpy=True, show_progress_bar=True)\n", + "y_train = y[train_idx]\n", + "y_test = y[test_idx]\n", + "\n", + "X_val = embedder.encode(val_texts, batch_size=256, convert_to_numpy=True, show_progress_bar=True)\n", + "\n", + "X_train.shape, X_test.shape, X_val.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9480535736637877\n", + " precision recall 
f1-score support\n", + "\n", + " Fake 0.95 0.94 0.95 3989\n", + " Real 0.94 0.96 0.95 4000\n", + "\n", + " accuracy 0.95 7989\n", + " macro avg 0.95 0.95 0.95 7989\n", + "weighted avg 0.95 0.95 0.95 7989\n", + "\n" + ] + } + ], + "source": [ + "# Train classifier\n", + "clf = LogisticRegression(max_iter=2000)\n", + "clf.fit(X_train, y_train)\n", + "\n", + "y_pred = clf.predict(X_test)\n", + "print('Accuracy:', accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred, target_names=['Fake','Real']))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Confusion matrix\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake', 'Real'])\n", + "disp.plot(cmap=plt.cm.Blues)\n", + "plt.title('Confusion Matrix')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LogReg accuracy: 0.9481\n", + "RandomForest accuracy: 0.9270\n", + "LinearSVM accuracy: 0.9567\n", + "\n", + "Best embeddings model: LinearSVM (0.9567)\n" + ] + } + ], + "source": [ + "# Compare models on embeddings\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "\n", + "rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)\n", + "svm = SVC(kernel='linear', probability=True, random_state=42)\n", + "\n", + "models = {\n", + " 'LogReg': clf, # already fitted above\n", + " 'RandomForest': rf,\n", + " 'LinearSVM': svm\n", + "}\n", + "\n", + "results = {}\n", + "preds = {}\n", + "probas = {}\n", + "\n", + "for name, m in models.items():\n", + " if name != 'LogReg':\n", + " m.fit(X_train, y_train)\n", + " y_pred = m.predict(X_test)\n", + " results[name] = accuracy_score(y_test, y_pred)\n", + " preds[name] = y_pred\n", + " if hasattr(m, 'predict_proba'):\n", + " probas[name] = m.predict_proba(X_test)\n", + " else:\n", + " # SVC without probas (if probability=False), but here True\n", + " probas[name] = None\n", + " print(f\"{name} accuracy: {results[name]:.4f}\")\n", + "\n", + "best_name = max(results, key=results.get)\n", + "best_model = models[best_name]\n", + "print(f\"\\nBest embeddings model: {best_name} ({results[best_name]:.4f})\")\n" + ] + 
}, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved to embeddings_fake_news_predictions_linearsvm.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlabelprobability_fakeprobability_real
0015.634485e-070.999999
1116.812508e-030.993187
2211.413487e-050.999986
3318.410532e-030.991589
4418.643167e-060.999991
\n", + "
" + ], + "text/plain": [ + " id label probability_fake probability_real\n", + "0 0 1 5.634485e-07 0.999999\n", + "1 1 1 6.812508e-03 0.993187\n", + "2 2 1 1.413487e-05 0.999986\n", + "3 3 1 8.410532e-03 0.991589\n", + "4 4 1 8.643167e-06 0.999991" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict on validation using best model and save\n", + "val_pred_best = best_model.predict(X_val)\n", + "val_proba_best = best_model.predict_proba(X_val) if hasattr(best_model, 'predict_proba') else None\n", + "\n", + "submission_best = pd.DataFrame({\n", + " 'id': range(len(val_df)),\n", + " 'label': val_pred_best\n", + "})\n", + "\n", + "if val_proba_best is not None:\n", + " submission_best['probability_fake'] = val_proba_best[:, 0]\n", + " submission_best['probability_real'] = val_proba_best[:, 1]\n", + "\n", + "out_path = f'embeddings_fake_news_predictions_{best_name.lower()}.csv'\n", + "submission_best.to_csv(out_path, index=False)\n", + "print(f'Saved to {out_path}')\n", + "submission_best.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved to embeddings_fake_news_predictions.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlabelprobability_fakeprobability_real
0010.0100440.989956
1110.0351590.964841
2210.0147080.985292
3310.0324490.967551
4410.0091780.990822
\n", + "
" + ], + "text/plain": [ + " id label probability_fake probability_real\n", + "0 0 1 0.010044 0.989956\n", + "1 1 1 0.035159 0.964841\n", + "2 2 1 0.014708 0.985292\n", + "3 3 1 0.032449 0.967551\n", + "4 4 1 0.009178 0.990822" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict on validation and save\n", + "val_pred = clf.predict(X_val)\n", + "val_proba = clf.predict_proba(X_val)\n", + "\n", + "submission = pd.DataFrame({\n", + " 'id': range(len(val_df)),\n", + " 'label': val_pred,\n", + " 'probability_fake': val_proba[:, 0],\n", + " 'probability_real': val_proba[:, 1]\n", + "})\n", + "\n", + "submission.to_csv('embeddings_fake_news_predictions.csv', index=False)\n", + "print('Saved to embeddings_fake_news_predictions.csv')\n", + "submission.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved labeled validation to: dataset/validation_data_labeled_embeddings.csv\n" + ] + } + ], + "source": [ + "# Save validation with predicted labels into dataset\n", + "import os\n", + "\n", + "# Try to use in-memory validation data if present; otherwise read from disk\n", + "try:\n", + " original_val = validation_data.copy()\n", + "except NameError:\n", + " original_val = pd.read_csv('dataset/validation_data.csv')\n", + "\n", + "# Ensure lengths match\n", + "assert len(original_val) == len(submission), 'Length mismatch between validation and predictions'\n", + "\n", + "labeled_val = original_val.copy()\n", + "labeled_val['label'] = submission['label'].values\n", + "\n", + "out_path = os.path.join('dataset', 'validation_data_labeled_embeddings.csv')\n", + "labeled_val.to_csv(out_path, index=False)\n", + "print(f\"Saved labeled validation to: {out_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2.3 embeddings_advanced_classifier.ipynb b/2.3 embeddings_advanced_classifier.ipynb new file mode 100644 index 0000000..a7d4176 --- /dev/null +++ b/2.3 embeddings_advanced_classifier.ipynb @@ -0,0 +1,629 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced Embeddings-based Classifier\n", + "\n", + "## Goals\n", + "- Stronger embeddings (`all-mpnet-base-v2`) + baseline (`all-MiniLM-L6-v2`)\n", + "- Tune LogReg / Linear SVM (C)\n", + "- Add simple MLP head over embeddings\n", + "- Baseline TF-IDF + LR and soft ensemble with embeddings\n", + "- Predict validation and save CSV\n", + "\n", + "Run top-to-bottom.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sergej/.pyenv/versions/3.11.8/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setup complete\n" + ] + } + ], + "source": [ + "# Setup\n", + "import sys\n", + "!{sys.executable} -m pip install -q sentence-transformers scikit-learn\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "print('Setup complete')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(31953, 7989, 4956)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data and build texts\n", + "train_df = pd.read_csv('dataset/data.csv')\n", + "val_df = pd.read_csv('dataset/validation_data.csv')\n", + "\n", + "train_texts = (train_df['title'].fillna('') + ' ' + train_df['text'].fillna('')).tolist()\n", + "val_texts = (val_df['title'].fillna('') + ' ' + val_df['text'].fillna('')).tolist()\n", + "y = train_df['label'].values\n", + "\n", + "idx = np.arange(len(train_texts))\n", + "train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42, stratify=y)\n", + "\n", + "y_train, y_test = y[train_idx], y[test_idx]\n", + "texts_train = [train_texts[i] for i in train_idx]\n", + "texts_test = [train_texts[i] for i in test_idx]\n", + "\n", + "len(texts_train), len(texts_test), len(val_texts)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 125/125 [04:25<00:00, 2.12s/it]\n", + "Batches: 100%|██████████| 32/32 [01:18<00:00, 2.44s/it]\n", + "Batches: 100%|██████████| 20/20 [00:47<00:00, 2.39s/it]\n", + "Batches: 100%|██████████| 500/500 [51:53<00:00, 6.23s/it] \n", + "Batches: 100%|██████████| 125/125 [14:08<00:00, 6.79s/it]\n", + "Batches: 100%|██████████| 78/78 [12:07<00:00, 9.33s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "((31953, 384), (31953, 768))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Encode embeddings (MiniLM + MPNet)\n", + "mini = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", + "mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n", + "\n", + "X_train_mini = mini.encode(texts_train, batch_size=256, convert_to_numpy=True, show_progress_bar=True)\n", + "X_test_mini = mini.encode(texts_test, batch_size=256, convert_to_numpy=True, show_progress_bar=True)\n", + "X_val_mini = mini.encode(val_texts, batch_size=256, convert_to_numpy=True, show_progress_bar=True)\n", + "\n", + "X_train_mp = mpnet.encode(texts_train, batch_size=64, convert_to_numpy=True, show_progress_bar=True)\n", + "X_test_mp = mpnet.encode(texts_test, batch_size=64, convert_to_numpy=True, show_progress_bar=True)\n", + "X_val_mp = mpnet.encode(val_texts, batch_size=64, convert_to_numpy=True, show_progress_bar=True)\n", + "\n", + "X_train_mini.shape, X_train_mp.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MiniLM | LR acc=0.9569 (C=5), SVM acc=0.9597 (C=5)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MPNet | LR acc=0.9740 (C=5), SVM acc=0.9795 (C=5)\n" + ] + }, + { + "data": { + "text/plain": [ + "{'MiniLM_LR': (0.9569407935911879, LogisticRegression(C=5, max_iter=3000)),\n", + " 'MiniLM_SVM': (0.9596945800475654,\n", + " SVC(C=5, kernel='linear', probability=True)),\n", + " 'MPNet_LR': (0.973964200776067, LogisticRegression(C=5, max_iter=3000)),\n", + " 'MPNet_SVM': (0.9794717736888221,\n", + " SVC(C=5, kernel='linear', probability=True))}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tune LogReg and Linear SVM on embeddings\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "results = {}\n", + "\n", + "def fit_and_eval(X_train, X_test, name):\n", + " # Logistic Regression\n", + " lr = GridSearchCV(LogisticRegression(max_iter=3000),\n", + " param_grid={'C':[0.1, 0.5, 1, 2, 5]},\n", + " cv=5, n_jobs=-1)\n", + " lr.fit(X_train, y_train)\n", + " pred_lr = lr.predict(X_test)\n", + " acc_lr = accuracy_score(y_test, pred_lr)\n", + "\n", + " # Linear SVM (probability=True для совместимости ансамбля)\n", + " svm = GridSearchCV(SVC(kernel='linear', probability=True),\n", + " param_grid={'C':[0.1, 0.5, 1, 2, 5]},\n", + " cv=5, n_jobs=-1)\n", + " svm.fit(X_train, y_train)\n", + " pred_svm = svm.predict(X_test)\n", + " acc_svm = 
accuracy_score(y_test, pred_svm)\n", + "\n", + " results[f'{name}_LR'] = (acc_lr, lr.best_estimator_)\n", + " results[f'{name}_SVM'] = (acc_svm, svm.best_estimator_)\n", + " print(f'{name} | LR acc={acc_lr:.4f} (C={lr.best_params_[\"C\"]}), SVM acc={acc_svm:.4f} (C={svm.best_params_[\"C\"]})')\n", + "\n", + "fit_and_eval(X_train_mini, X_test_mini, 'MiniLM')\n", + "fit_and_eval(X_train_mp, X_test_mp, 'MPNet')\n", + "\n", + "results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MLP (MPNet) acc: 0.9833521091500813\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sergej/.pyenv/versions/3.11.8/lib/python3.11/site-packages/sklearn/neural_network/_multilayer_perceptron.py:781: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (30) reached and the optimization hasn't converged yet.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "{'MiniLM_LR': (0.9569407935911879, LogisticRegression(C=5, max_iter=3000)),\n", + " 'MiniLM_SVM': (0.9596945800475654,\n", + " SVC(C=5, kernel='linear', probability=True)),\n", + " 'MPNet_LR': (0.973964200776067, LogisticRegression(C=5, max_iter=3000)),\n", + " 'MPNet_SVM': (0.9794717736888221,\n", + " SVC(C=5, kernel='linear', probability=True)),\n", + " 'MPNet_MLP': (0.9833521091500813,\n", + " MLPClassifier(batch_size=256, hidden_layer_sizes=(256,), max_iter=30,\n", + " random_state=42))}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# MLP head over embeddings (MPNet)\n", + "mlp = MLPClassifier(hidden_layer_sizes=(256,), activation='relu', solver='adam',\n", + " alpha=1e-4, batch_size=256, max_iter=30, random_state=42)\n", + "mlp.fit(X_train_mp, y_train)\n", + "\n", + "pred_mlp = mlp.predict(X_test_mp)\n", + "acc_mlp = accuracy_score(y_test, pred_mlp)\n", + "print('MLP (MPNet) acc:', acc_mlp)\n", 
+ "\n", + "results['MPNet_MLP'] = (acc_mlp, mlp)\n", + "results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TF-IDF+LR acc: 0.9888596820628364\n", + "Best embeddings model: TFIDF_LR acc=0.9889\n", + "Soft ensemble acc: 0.9888596820628364\n" + ] + }, + { + "data": { + "text/plain": [ + "{'MiniLM_LR': (0.9569407935911879, LogisticRegression(C=5, max_iter=3000)),\n", + " 'MiniLM_SVM': (0.9596945800475654,\n", + " SVC(C=5, kernel='linear', probability=True)),\n", + " 'MPNet_LR': (0.973964200776067, LogisticRegression(C=5, max_iter=3000)),\n", + " 'MPNet_SVM': (0.9794717736888221,\n", + " SVC(C=5, kernel='linear', probability=True)),\n", + " 'MPNet_MLP': (0.9833521091500813,\n", + " MLPClassifier(batch_size=256, hidden_layer_sizes=(256,), max_iter=30,\n", + " random_state=42)),\n", + " 'TFIDF_LR': (0.9888596820628364,\n", + " (TfidfVectorizer(max_df=0.8, max_features=5000, min_df=3, ngram_range=(1, 2)),\n", + " LogisticRegression(max_iter=3000))),\n", + " 'SOFT_ENSEMBLE': (0.9888596820628364, ('ensemble', 'TFIDF_LR'))}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TF-IDF + LR baseline and soft ensemble with best embedding model\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# TF-IDF baseline (на train/test)\n", + "vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.8)\n", + "X_train_tf = vectorizer.fit_transform(texts_train)\n", + "X_test_tf = vectorizer.transform(texts_test)\n", + "\n", + "lr_tf = LogisticRegression(max_iter=3000)\n", + "lr_tf.fit(X_train_tf, y_train)\n", + "\n", + "pred_tf = lr_tf.predict(X_test_tf)\n", + "acc_tf = accuracy_score(y_test, pred_tf)\n", + "print('TF-IDF+LR acc:', acc_tf)\n", + "\n", + "results['TFIDF_LR'] = (acc_tf, (vectorizer, lr_tf))\n", + "\n", + 
"# Выбор лучшей embeddings-модели\n", + "best_name = max(results, key=lambda k: results[k][0])\n", + "best_acc, best_est = results[best_name]\n", + "print(f'Best embeddings model: {best_name} acc={best_acc:.4f}')\n", + "\n", + "# Подготовим вероятности для soft-ensemble (если модель умеет probas)\n", + "def get_proba(est, X):\n", + " if isinstance(est, LogisticRegression) or isinstance(est, SVC):\n", + " return est.predict_proba(X)\n", + " if isinstance(est, MLPClassifier):\n", + " return est.predict_proba(X)\n", + " # (vectorizer, lr) пара\n", + " if isinstance(est, tuple) and isinstance(est[1], LogisticRegression):\n", + " vec, lrm = est\n", + " return lrm.predict_proba(vec.transform(texts_test))\n", + " return None\n", + "\n", + "'''\n", + "# Вероятности\n", + "# Для embeddings лучшую модель применим на соответствующем X_test\n", + "if 'MPNet' in best_name:\n", + " proba_emb = best_est.predict_proba(X_test_mp)\n", + "else:\n", + " proba_emb = best_est.predict_proba(X_test_mini)\n", + "'''\n", + "\n", + "if 'MPNet' in best_name:\n", + " proba_emb = get_proba(best_est, X_test_mp)\n", + "else:\n", + " proba_emb = get_proba(best_est, X_test_mini)\n", + "\n", + "\n", + "\n", + "proba_tf = lr_tf.predict_proba(X_test_tf)\n", + "\n", + "# Soft-average ансамбль\n", + "proba_ens = 0.5 * proba_emb + 0.5 * proba_tf\n", + "pred_ens = np.argmax(proba_ens, axis=1)\n", + "acc_ens = accuracy_score(y_test, pred_ens)\n", + "print('Soft ensemble acc:', acc_ens)\n", + "\n", + "results['SOFT_ENSEMBLE'] = (acc_ens, ('ensemble', best_name))\n", + "results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best overall: TFIDF_LR 0.9888596820628364\n", + "Saved to advanced_embeddings_predictions_TFIDF_LR.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlabelprobability_fakeprobability_real
0000.8154720.184528
1100.7539690.246031
2210.4956470.504353
3300.5369330.463067
4400.7078620.292138
\n", + "
" + ], + "text/plain": [ + " id label probability_fake probability_real\n", + "0 0 0 0.815472 0.184528\n", + "1 1 0 0.753969 0.246031\n", + "2 2 1 0.495647 0.504353\n", + "3 3 0 0.536933 0.463067\n", + "4 4 0 0.707862 0.292138" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Pick best overall and predict validation\n", + "best_overall = max(results, key=lambda k: results[k][0])\n", + "best_overall_acc, best_overall_est = results[best_overall]\n", + "print('Best overall:', best_overall, best_overall_acc)\n", + "\n", + "# Build validation predictions\n", + "if best_overall.startswith('MiniLM_'):\n", + " proba_val = best_overall_est.predict_proba(X_val_mini)\n", + "elif best_overall.startswith('MPNet_'):\n", + " proba_val = best_overall_est.predict_proba(X_val_mp)\n", + "elif best_overall == 'MPNet_MLP':\n", + " proba_val = best_overall_est.predict_proba(X_val_mp)\n", + "elif best_overall == 'TFIDF_LR':\n", + " proba_val = best_overall_est[1].predict_proba(TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.8).fit(train_texts).transform(val_texts))\n", + "elif best_overall == 'SOFT_ENSEMBLE':\n", + " # ensemble = 0.5 * best embedding + 0.5 * TF-IDF LR\n", + " if 'MPNet' in best_name:\n", + " proba_emb_val = best_est.predict_proba(X_val_mp)\n", + " else:\n", + " proba_emb_val = best_est.predict_proba(X_val_mini)\n", + " proba_tf_val = lr_tf.predict_proba(TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.8).fit(train_texts).transform(val_texts))\n", + " proba_val = 0.5 * proba_emb_val + 0.5 * proba_tf_val\n", + "else:\n", + " raise ValueError('Unknown best model key')\n", + "\n", + "pred_val = np.argmax(proba_val, axis=1)\n", + "sub = pd.DataFrame({\n", + " 'id': range(len(val_df)),\n", + " 'label': pred_val,\n", + " 'probability_fake': proba_val[:,0],\n", + " 'probability_real': proba_val[:,1]\n", + "})\n", + "\n", + "out_path = 
f'advanced_embeddings_predictions_{best_overall}.csv'\n", + "sub.to_csv(out_path, index=False)\n", + "print('Saved to', out_path)\n", + "sub.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved labeled validation to: dataset/validation_data_labeled_embeddings_advanced.csv\n" + ] + } + ], + "source": [ + "# Save validation with predicted labels into dataset\n", + "import os\n", + "\n", + "# Try to use in-memory validation data if present; otherwise read from disk\n", + "try:\n", + " original_val = val_df.copy()\n", + "except NameError:\n", + " original_val = pd.read_csv('dataset/validation_data.csv')\n", + "\n", + "# Ensure lengths match\n", + "assert len(original_val) == len(sub), 'Length mismatch between validation and predictions'\n", + "\n", + "labeled_val = original_val.copy()\n", + "labeled_val['label'] = sub['label'].values\n", + "\n", + "out_path = os.path.join('dataset', 'validation_data_labeled_embeddings_advanced.csv')\n", + "labeled_val.to_csv(out_path, index=False)\n", + "print(f\"Saved labeled validation to: {out_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2.4 enhanced_classifier_minus_reuters.ipynb b/2.4 enhanced_classifier_minus_reuters.ipynb new file mode 100644 index 0000000..60126aa --- /dev/null +++ b/2.4 enhanced_classifier_minus_reuters.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Enhanced Fake News Classifier\n", + "\n", + "## Overview\n", + "This notebook 
implements an enhanced fake news classifier with:\n", + "- Robust text preprocessing (cleaning, stopword removal, lemmatization)\n", + "- Advanced feature engineering (length, punctuation, lexical diversity, subject, dates)\n", + "- TF-IDF vectorization and numerical feature scaling\n", + "- Multiple models (Logistic Regression, Random Forest, Gradient Boosting, SVM)\n", + "- Soft-voting ensemble and cross-validation\n", + "- Predictions on validation set with confidence scores\n", + "\n", + "Run cells top-to-bottom.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setup complete\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /Users/sergej/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + "# Imports and setup\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Text processing\n", + "import re\n", + "from collections import Counter\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# ML\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import classification_report, accuracy_score\n", + "from sklearn.preprocessing import StandardScaler\n", + "from scipy.sparse import hstack\n", + "\n", + "# NLTK data\n", + "for pkg, path in 
[('punkt','tokenizers/punkt'),('stopwords','corpora/stopwords'),('wordnet','corpora/wordnet')]:\n", + " try:\n", + " nltk.data.find(path)\n", + " except LookupError:\n", + " nltk.download(pkg)\n", + "\n", + "print('Setup complete')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data...\n", + "Training: (39942, 10), Validation: (4956, 10)\n" + ] + } + ], + "source": [ + "# Load data\n", + "print('Loading data...')\n", + "train_data = pd.read_csv('dataset/processed_training_data.csv')\n", + "validation_data = pd.read_csv('dataset/processed_validation_data.csv')\n", + "print(f'Training: {train_data.shape}, Validation: {validation_data.shape}')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = train_data.drop('subject', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preprocessing functions ready\n" + ] + } + ], + "source": [ + "# Preprocessing functions\n", + "def clean_text(text):\n", + " if pd.isna(text):\n", + " return ''\n", + " text = text.lower()\n", + " text = re.sub(r'[^a-zA-Z\\s]', '', text)\n", + " text = ' '.join(text.split())\n", + " return text\n", + "\n", + "def remove_stopwords(text):\n", + " if pd.isna(text):\n", + " return ''\n", + " \n", + " # remove reuters news\n", + " try:\n", + " text = re.sub(r'\\(\\s*reuters\\s*\\)', '', text, flags=re.IGNORECASE)\n", + " except Exception as e:\n", + " pass\n", + "\n", + "\n", + " try:\n", + " stop_words = set(stopwords.words('english'))\n", + " stop_words.add('reuters') \n", + " \n", + " text = re.sub(r'\\breuters\\b', '', text, flags=re.IGNORECASE)\n", + " words = word_tokenize(text)\n", + " return ' '.join([w for w in words if w.lower() not in stop_words])\n", + " except:\n", + " return 
text\n", + "\n", + "def lemmatize_text(text):\n", + " if pd.isna(text):\n", + " return ''\n", + " try:\n", + " lemmatizer = WordNetLemmatizer()\n", + " words = word_tokenize(text)\n", + " return ' '.join([lemmatizer.lemmatize(w) for w in words])\n", + " except:\n", + " return text\n", + "\n", + "def preprocess_text(text):\n", + " text = clean_text(text)\n", + " text = remove_stopwords(text)\n", + " text = lemmatize_text(text)\n", + " return text\n", + "\n", + "print('Preprocessing functions ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Date parser ready\n" + ] + } + ], + "source": [ + "# Advanced date parsing\n", + "\n", + "def parse_dates_robust(date_series):\n", + " formats = ['%B %d, %Y', '%d-%b-%y', '%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y']\n", + " parsed_dates = pd.Series([pd.NaT] * len(date_series), index=date_series.index)\n", + " for fmt in formats:\n", + " try:\n", + " temp = pd.to_datetime(date_series, format=fmt, errors='coerce')\n", + " mask = parsed_dates.isna() & temp.notna()\n", + " parsed_dates[mask] = temp[mask]\n", + " except:\n", + " continue\n", + " return parsed_dates\n", + "\n", + "print('Date parser ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Engineering features...\n", + "Features ready\n" + ] + } + ], + "source": [ + "# Feature engineering\n", + "print('Engineering features...')\n", + "\n", + "# Process text\n", + "data = train_data.copy()\n", + "data['text_processed'] = data['text'].apply(preprocess_text)\n", + "data['title_processed'] = data['title'].apply(preprocess_text)\n", + "\n", + "# Basic lengths\n", + "data['text_length'] = data['text'].str.len()\n", + "data['title_length'] = data['title'].str.len()\n", + "data['word_count'] = data['text'].str.split().str.len()\n", + 
"data['title_word_count'] = data['title'].str.split().str.len()\n", + "\n", + "# Advanced text features\n", + "data['avg_word_length'] = data['text'].str.split().str.join(' ').str.len() / data['word_count']\n", + "data['title_avg_word_length'] = data['title'].str.split().str.join(' ').str.len() / data['title_word_count']\n", + "\n", + "# Punctuation features (escape regex)\n", + "data['exclamation_count'] = data['text'].str.count(r'!')\n", + "data['question_count'] = data['text'].str.count(r'\\?')\n", + "data['quote_count'] = data['text'].str.count(r'\"')\n", + "data['capital_ratio'] = data['text'].str.count(r'[A-Z]') / data['text_length']\n", + "\n", + "# Lexical diversity\n", + "data['unique_words_ratio'] = data['text_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "data['title_unique_words_ratio'] = data['title_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "\n", + "# Subject encoding\n", + "'''\n", + "from sklearn.preprocessing import LabelEncoder\n", + "le = LabelEncoder()\n", + "data['subject_encoded'] = le.fit_transform(data['subject'])\n", + "subject_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n", + "DEFAULT_SUBJECT = 0\n", + "\n", + "''' \n", + "\n", + "\n", + "# Date features\n", + "data['date_clean'] = data['date'].str.strip()\n", + "data['date_parsed'] = parse_dates_robust(data['date_clean'])\n", + "data['year'] = data['date_parsed'].dt.year\n", + "data['month'] = data['date_parsed'].dt.month\n", + "data['day_of_week'] = data['date_parsed'].dt.dayofweek\n", + "data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)\n", + "\n", + "numerical_features = [\n", + " 'text_length','title_length', # 'word_count','title_word_count',\n", + " 'avg_word_length','title_avg_word_length',\n", + " 'exclamation_count','question_count','quote_count','capital_ratio',\n", + " 'unique_words_ratio','title_unique_words_ratio', # 'subject_encoded',\n", + " 
'day_of_week','is_weekend' #'year','month'\n", + "]\n", + "\n", + "print('Features ready')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vectorization and scaling complete\n" + ] + } + ], + "source": [ + "# Train/test split and vectorization\n", + "X_text = data['text_processed']\n", + "X_num = data[numerical_features].fillna(0)\n", + "y = data['label']\n", + "\n", + "X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(\n", + " X_text, X_num, y, test_size=0.2, random_state=42, stratify=y\n", + ")\n", + "\n", + "# TF-IDF\n", + "tfidf_vectorizer = TfidfVectorizer(\n", + " max_features=8000,\n", + " ngram_range=(1,3),\n", + " min_df=3,\n", + " max_df=0.8,\n", + " sublinear_tf=True\n", + ")\n", + "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)\n", + "X_test_tfidf = tfidf_vectorizer.transform(X_test_text)\n", + "\n", + "# Scale numerical features\n", + "scaler = StandardScaler()\n", + "X_train_num_scaled = scaler.fit_transform(X_train_num)\n", + "X_test_num_scaled = scaler.transform(X_test_num)\n", + "\n", + "# Combine\n", + "from scipy.sparse import hstack\n", + "X_train_combined = hstack([X_train_tfidf, X_train_num_scaled])\n", + "X_test_combined = hstack([X_test_tfidf, X_test_num_scaled])\n", + "\n", + "print('Vectorization and scaling complete')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest: 0.9875\n", + "\n", + "Best model: Random Forest (0.9875)\n" + ] + } + ], + "source": [ + "# Train models\n", + "lr_model = LogisticRegression(random_state=42, max_iter=2000, C=1.0)\n", + "rf_model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)\n", + "gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)\n", + "svm_model = SVC(random_state=42, 
probability=True, kernel='rbf')\n", + "ensemble_model = VotingClassifier(\n", + " estimators=[('lr', lr_model), ('rf', rf_model), ('gb', gb_model), ('svm', svm_model)],\n", + " voting='soft'\n", + ")\n", + "\n", + "models = {\n", + " # 'Logistic Regression': lr_model,\n", + " 'Random Forest': rf_model\n", + " # , 'Gradient Boosting': gb_model\n", + " # , 'SVM': svm_model \n", + " #, 'Ensemble': ensemble_model\n", + "}\n", + "\n", + "results = {}\n", + "for name, model in models.items():\n", + " model.fit(X_train_combined, y_train)\n", + " y_pred = model.predict(X_test_combined)\n", + " acc = accuracy_score(y_test, y_pred)\n", + " results[name] = acc\n", + " print(f\"{name}: {acc:.4f}\")\n", + "\n", + "best_model_name = max(results, key=results.get)\n", + "best_model = models[best_model_name]\n", + "print(f\"\\nBest model: {best_model_name} ({results[best_model_name]:.4f})\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV scores: [0.98857769 0.98967298 0.9906118 0.99045383 0.9885759 ]\n", + "CV mean: 0.9895784393595356 +/- 0.0017549183989109174\n" + ] + } + ], + "source": [ + "# Cross-validation\n", + "cv_scores = cross_val_score(best_model, X_train_combined, y_train, cv=5)\n", + "print('CV scores:', cv_scores)\n", + "print('CV mean:', cv_scores.mean(), '+/-', cv_scores.std()*2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Evaluation of the best model (Random Forest):\n", + "Accuracy: 0.9874827888346477\n", + " precision recall f1-score support\n", + "\n", + " Fake 0.99 0.98 0.99 3989\n", + " Real 0.98 0.99 0.99 4000\n", + "\n", + " accuracy 0.99 7989\n", + " macro avg 0.99 0.99 0.99 7989\n", + "weighted avg 0.99 0.99 0.99 7989\n", + "\n" + ] + } + ], + "source": [ + "y_pred_best = best_model.predict(X_test_combined)\n", + 
"\n", + "print(f\"\\nEvaluation of the best model ({best_model_name}):\")\n", + "print('Accuracy:', accuracy_score(y_test, y_pred_best))\n", + "print(classification_report(y_test, y_pred_best, target_names=['Fake','Real']))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Confusion matrix\n", + "cm = confusion_matrix(y_test, y_pred_best)\n", + "\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake', 'Real'])\n", + "disp.plot(cmap=plt.cm.Blues)\n", + "plt.title(f'Confusion Matrix for {best_model_name}')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top text features:\n", + "video: 0.0099\n", + "president donald trump: 0.0115\n", + "president donald: 0.0130\n", + "like: 0.0140\n", + "washington: 0.0143\n", + "minister: 0.0149\n", + "image via: 0.0180\n", + "image: 0.0304\n", + "via: 0.0327\n", + "said: 0.0350\n", + "\n", + "Numerical features:\n", + "text_length: 0.0114\n", + "title_length: 0.0685\n", + "avg_word_length: 0.0183\n", + "title_avg_word_length: 0.0005\n", + "exclamation_count: 0.0185\n", + "question_count: 0.0278\n", + "quote_count: 0.0007\n", + "capital_ratio: 0.0035\n", + "unique_words_ratio: 0.0028\n", + "title_unique_words_ratio: 0.0028\n", + "day_of_week: 0.0343\n", + "is_weekend: 0.0003\n" + ] + } + ], + "source": [ + "# Feature importance (if available)\n", + "if hasattr(best_model, 'feature_importances_'):\n", + " importances = best_model.feature_importances_\n", + " text_size = X_train_tfidf.shape[1]\n", + " text_imps = importances[:text_size]\n", + " top_idx = np.argsort(text_imps)[-10:]\n", + " top_feats = [tfidf_vectorizer.get_feature_names_out()[i] for i in top_idx]\n", + " print('Top text features:')\n", + " for f, w in zip(top_feats, text_imps[top_idx]):\n", + " print(f'{f}: {w:.4f}')\n", + " num_imps = importances[text_size:]\n", + " print('\\nNumerical features:')\n", + " for f, w in 
zip(numerical_features, num_imps):\n", + " print(f'{f}: {w:.4f}')\n", + "else:\n", + " print('Model has no feature_importances_ attribute')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlabelprobability_fakeprobability_realconfidence
0010.1689780.8310220.831022
1110.3765230.6234770.623477
2210.1765690.8234310.823431
3310.1079700.8920300.892030
4410.1792640.8207360.820736
\n", + "
" + ], + "text/plain": [ + " id label probability_fake probability_real confidence\n", + "0 0 1 0.168978 0.831022 0.831022\n", + "1 1 1 0.376523 0.623477 0.623477\n", + "2 2 1 0.176569 0.823431 0.823431\n", + "3 3 1 0.107970 0.892030 0.892030\n", + "4 4 1 0.179264 0.820736 0.820736" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predict on validation data\n", + "val = validation_data.copy()\n", + "val['text_processed'] = val['text'].apply(preprocess_text)\n", + "val['title_processed'] = val['title'].apply(preprocess_text)\n", + "val['text_length'] = val['text'].str.len()\n", + "val['title_length'] = val['title'].str.len()\n", + "val['word_count'] = val['text'].str.split().str.len()\n", + "val['title_word_count'] = val['title'].str.split().str.len()\n", + "val['avg_word_length'] = val['text'].str.split().str.join(' ').str.len() / val['word_count']\n", + "val['title_avg_word_length'] = val['title'].str.split().str.join(' ').str.len() / val['title_word_count']\n", + "val['exclamation_count'] = val['text'].str.count(r'!')\n", + "val['question_count'] = val['text'].str.count(r'\\?')\n", + "val['quote_count'] = val['text'].str.count(r'\"')\n", + "val['capital_ratio'] = val['text'].str.count(r'[A-Z]') / val['text_length']\n", + "val['unique_words_ratio'] = val['text_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "val['title_unique_words_ratio'] = val['title_processed'].str.split().apply(lambda x: len(set(x)) / len(x) if len(x) > 0 else 0)\n", + "\n", + "# Subject encoding with unseen handling\n", + "# val['subject_encoded'] = val['subject'].map(subject_mapping).fillna(DEFAULT_SUBJECT).astype(int)\n", + "\n", + "# Date features\n", + "val['date_clean'] = val['date'].str.strip()\n", + "val['date_parsed'] = parse_dates_robust(val['date_clean'])\n", + "#val['year'] = val['date_parsed'].dt.year\n", + "#val['month'] = val['date_parsed'].dt.month\n", + "val['day_of_week'] 
= val['date_parsed'].dt.dayofweek\n", + "val['is_weekend'] = val['day_of_week'].isin([5, 6]).astype(int)\n", + "\n", + "X_val_text = val['text_processed']\n", + "X_val_num = val[numerical_features].fillna(0)\n", + "X_val_num_scaled = scaler.transform(X_val_num)\n", + "X_val_tfidf = tfidf_vectorizer.transform(X_val_text)\n", + "X_val_combined = hstack([X_val_tfidf, X_val_num_scaled])\n", + "\n", + "val_pred = best_model.predict(X_val_combined)\n", + "val_proba = best_model.predict_proba(X_val_combined)\n", + "\n", + "submission = pd.DataFrame({\n", + " 'id': range(len(val)),\n", + " 'label': val_pred,\n", + " 'probability_fake': val_proba[:, 0],\n", + " 'probability_real': val_proba[:, 1],\n", + " 'confidence': np.max(val_proba, axis=1)\n", + "})\n", + "\n", + "submission.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved to enhanced_fake_news_predictions.csv\n", + "Distribution:\n", + "label\n", + "fake 0.64427\n", + "real 0.35573\n", + "Name: proportion, dtype: float64\n" + ] + } + ], + "source": [ + "# Save predictions\n", + "submission.to_csv('enhanced_fake_news_predictions.csv', index=False)\n", + "print(\"Saved to enhanced_fake_news_predictions.csv\")\n", + "print('Distribution:')\n", + "print(submission['label'].value_counts(normalize=True).rename({0:'fake',1:'real'}))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved labeled validation to: dataset/validation_data_labeled_minus_reuters.csv\n" + ] + } + ], + "source": [ + "# Save validation with predicted labels into dataset\n", + "import os\n", + "\n", + "# Try to use in-memory validation data if present; otherwise read from disk\n", + "try:\n", + " original_val = validation_data.copy()\n", + "except NameError:\n", + " original_val = 
pd.read_csv('dataset/validation_data.csv')\n", + "\n", + "# Ensure lengths match\n", + "assert len(original_val) == len(submission), 'Length mismatch between validation and predictions'\n", + "\n", + "labeled_val = original_val.copy()\n", + "labeled_val['label'] = submission['label'].values\n", + "\n", + "out_path = os.path.join('dataset', 'validation_data_labeled_minus_reuters.csv')\n", + "labeled_val.to_csv(out_path, index=False)\n", + "print(f\"Saved labeled validation to: {out_path}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2.5 Final_classifier_and_XGBoost.ipynb b/2.5 Final_classifier_and_XGBoost.ipynb new file mode 100644 index 0000000..d19b848 --- /dev/null +++ b/2.5 Final_classifier_and_XGBoost.ipynb @@ -0,0 +1,1311 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comprehensive Fake News Classifier with Validation\n", + "\n", + "This notebook contains a complete data analysis, training of various machine learning models for fake news classification, and validation on a separate dataset.\n", + "\n", + "## Work Plan:\n", + "1. Data loading and duplicate removal\n", + "2. NLP preprocessing with creation of additional features (including bracket difference)\n", + "3. Vectorization using TF-IDF\n", + "4. Train/test split\n", + "5. Model training and comparison (RandomForest, XGBoost)\n", + "6. Best model selection and saving\n", + "7. Training on full dataset\n", + "8. Feature importance analysis\n", + "9. 
Predictions on validation data and saving results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Libraries successfully imported!\n" + ] + } + ], + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import string\n", + "from collections import Counter\n", + "import joblib\n", + "import pickle\n", + "\n", + "# NLP libraries\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "# Machine learning models\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "import xgboost as xgb\n", + "\n", + "# Additional libraries\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Download stopwords\n", + "try:\n", + " nltk.data.find('tokenizers/punkt')\n", + "except LookupError:\n", + " nltk.download('punkt')\n", + "\n", + "try:\n", + " nltk.data.find('corpora/stopwords')\n", + "except LookupError:\n", + " nltk.download('stopwords')\n", + "\n", + "print(\"Libraries successfully imported!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Loading and Duplicate Removal\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data...\n", + "Main dataset (data.csv): (39942, 5)\n", + "Columns: ['label', 'title', 'text', 'subject', 'date']\n", + "Validation dataset: (4956, 5)\n", + "Columns: ['label', 'title', 'text', 'subject', 'date']\n", + "\n", + "Duplicates in main dataset: 201\n", + "Main dataset size after removing duplicates: (39741, 5)\n", + "✅ train_data has all required columns!\n", + "✅ validation_data has all required columns!\n", + "\n", + "First 3 rows of main dataset:\n", + " label title \\\n", + "0 1 As U.S. budget fight looms, Republicans flip t... \n", + "1 1 U.S. military to accept transgender recruits o... \n", + "2 1 Senior U.S. Republican senator: 'Let Mr. Muell... \n", + "\n", + " text subject \\\n", + "0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n", + "1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n", + "2 WASHINGTON (Reuters) - The special counsel inv... politicsNews \n", + "\n", + " date \n", + "0 December 31, 2017 \n", + "1 December 29, 2017 \n", + "2 December 31, 2017 \n", + "\n", + "First 3 rows of validation dataset:\n", + " label title \\\n", + "0 2 UK's May 'receiving regular updates' on London... \n", + "1 2 UK transport police leading investigation of L... \n", + "2 2 Pacific nations crack down on North Korean shi... \n", + "\n", + " text subject \\\n", + "0 LONDON (Reuters) - British Prime Minister Ther... worldnews \n", + "1 LONDON (Reuters) - British counter-terrorism p... worldnews \n", + "2 WELLINGTON (Reuters) - South Pacific island na... 
worldnews \n", + "\n", + " date \n", + "0 September 15, 2017 \n", + "1 September 15, 2017 \n", + "2 September 15, 2017 \n" + ] + } + ], + "source": [ + "# Load data\n", + "print(\"Loading data...\")\n", + "\n", + "# Load main dataset\n", + "train_data = pd.read_csv('dataset/data.csv')\n", + "print(f\"Main dataset (data.csv): {train_data.shape}\")\n", + "print(f\"Columns: {list(train_data.columns)}\")\n", + "\n", + "# Load validation data\n", + "validation_data = pd.read_csv('dataset/validation_data.csv')\n", + "print(f\"Validation dataset: {validation_data.shape}\")\n", + "print(f\"Columns: {list(validation_data.columns)}\")\n", + "\n", + "# Check for duplicates in main dataset\n", + "print(f\"\\nDuplicates in main dataset: {train_data.duplicated().sum()}\")\n", + "train_data = train_data.drop_duplicates()\n", + "print(f\"Main dataset size after removing duplicates: {train_data.shape}\")\n", + "\n", + "# Check for required columns\n", + "required_columns = ['title', 'text', 'label']\n", + "for dataset_name, dataset in [('train_data', train_data), ('validation_data', validation_data)]:\n", + " missing_columns = [col for col in required_columns if col not in dataset.columns]\n", + " if missing_columns:\n", + " print(f\"❌ {dataset_name} is missing columns: {missing_columns}\")\n", + " else:\n", + " print(f\"✅ {dataset_name} has all required columns!\")\n", + "\n", + "print(\"\\nFirst 3 rows of main dataset:\")\n", + "print(train_data.head(3))\n", + "print(\"\\nFirst 3 rows of validation dataset:\")\n", + "print(validation_data.head(3))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
NLP Preprocessing and Additional Feature Creation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Text processing functions created!\n" + ] + } + ], + "source": [ + "# Text cleaning function\n", + "def clean_text(text):\n", + " \"\"\"Clean text from extra characters and convert to lowercase\"\"\"\n", + " if pd.isna(text):\n", + " return \"\"\n", + " \n", + " # Convert to string and lowercase\n", + " text = str(text).lower()\n", + " \n", + " # Remove extra spaces\n", + " text = re.sub(r'\\s+', ' ', text)\n", + " \n", + " # Remove extra characters but preserve punctuation for counting\n", + " text = text.strip()\n", + " \n", + " return text\n", + "\n", + "# Function to count special characters (including bracket difference)\n", + "def count_special_chars(text):\n", + " \"\"\"Count special characters in text\"\"\"\n", + " if pd.isna(text):\n", + " return 0, 0, 0, 0\n", + " \n", + " text = str(text)\n", + " \n", + " # Count opening brackets\n", + " open_brackets = text.count('(') + text.count('[') + text.count('{')\n", + " \n", + " # Count closing brackets\n", + " closed_brackets = text.count(')') + text.count(']') + text.count('}')\n", + " \n", + " # Count exclamation marks\n", + " exclamation_marks = text.count('!')\n", + " \n", + " # Count question marks\n", + " question_marks = text.count('?')\n", + " \n", + " return open_brackets, closed_brackets, exclamation_marks, question_marks\n", + "\n", + "# Function to count characters\n", + "def count_characters(text):\n", + " \"\"\"Count number of characters in text\"\"\"\n", + " if pd.isna(text):\n", + " return 0\n", + " return len(str(text))\n", + "\n", + "# Function for vectorization preparation\n", + "def preprocess_for_vectorization(text):\n", + " \"\"\"Prepare text for vectorization with stopword removal\"\"\"\n", + " if pd.isna(text) or text == \"\":\n", + " return \"\"\n", + " \n", + " # 
Tokenization\n", + " tokens = word_tokenize(str(text))\n", + " \n", + " # Get stopwords\n", + " stop_words = set(stopwords.words('english'))\n", + " \n", + " # Remove stopwords and short words\n", + " filtered_tokens = [word for word in tokens \n", + " if word.lower() not in stop_words \n", + " and len(word) > 2 \n", + " and word.isalpha()]\n", + " \n", + " return ' '.join(filtered_tokens)\n", + "\n", + "print(\"✅ Text processing functions created!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing main dataset (data.csv)...\n", + "Cleaning text data...\n", + "Creating additional features...\n", + "Preparing texts for vectorization...\n", + "✅ Main dataset prepared!\n", + "New columns: ['title_clean', 'text_clean', 'open_brackets', 'closed_brackets', 'exclamation_marks', 'question_marks', 'bracket_difference', 'text_char_count', 'title_char_count', 'title_processed', 'text_processed', 'combined_text']\n", + "\n", + "Statistics for new features:\n", + " open_brackets closed_brackets bracket_difference exclamation_marks \\\n", + "count 39741.000000 39741.000000 39741.00000 39741.000000 \n", + "mean 1.824086 1.821796 0.00229 0.412118 \n", + "std 7.067190 7.052449 0.36591 1.483599 \n", + "min 0.000000 0.000000 -16.00000 0.000000 \n", + "25% 1.000000 1.000000 0.00000 0.000000 \n", + "50% 1.000000 1.000000 0.00000 0.000000 \n", + "75% 2.000000 2.000000 0.00000 0.000000 \n", + "max 1110.000000 1108.000000 34.00000 133.000000 \n", + "\n", + " question_marks text_char_count title_char_count \n", + "count 39741.000000 39741.000000 39741.000000 \n", + "mean 0.643240 2382.079288 79.840970 \n", + "std 1.746128 1766.137489 24.841118 \n", + "min 0.000000 1.000000 8.000000 \n", + "25% 0.000000 1255.000000 63.000000 \n", + "50% 0.000000 2195.000000 73.000000 \n", + "75% 1.000000 3065.000000 90.000000 \n", + "max 94.000000 49705.000000 286.000000 \n" + ] + } 
+ ], + "source": [ + "# Prepare main dataset\n", + "print(\"Preparing main dataset (data.csv)...\")\n", + "\n", + "# Clean text data\n", + "print(\"Cleaning text data...\")\n", + "train_data['title_clean'] = train_data['title'].apply(clean_text)\n", + "train_data['text_clean'] = train_data['text'].apply(clean_text)\n", + "\n", + "# Create additional features\n", + "print(\"Creating additional features...\")\n", + "\n", + "# Count special characters in text\n", + "special_chars = train_data['text'].apply(count_special_chars)\n", + "train_data['open_brackets'] = [x[0] for x in special_chars]\n", + "train_data['closed_brackets'] = [x[1] for x in special_chars]\n", + "train_data['exclamation_marks'] = [x[2] for x in special_chars]\n", + "train_data['question_marks'] = [x[3] for x in special_chars]\n", + "\n", + "# New feature: difference between opening and closing brackets\n", + "train_data['bracket_difference'] = train_data['open_brackets'] - train_data['closed_brackets']\n", + "\n", + "# Count characters\n", + "train_data['text_char_count'] = train_data['text'].apply(count_characters)\n", + "train_data['title_char_count'] = train_data['title'].apply(count_characters)\n", + "\n", + "# Prepare for vectorization\n", + "print(\"Preparing texts for vectorization...\")\n", + "train_data['title_processed'] = train_data['title_clean'].apply(preprocess_for_vectorization)\n", + "train_data['text_processed'] = train_data['text_clean'].apply(preprocess_for_vectorization)\n", + "\n", + "# Combine title and text for vectorization\n", + "train_data['combined_text'] = train_data['title_processed'] + ' ' + train_data['text_processed']\n", + "\n", + "print(\"✅ Main dataset prepared!\")\n", + "print(f\"New columns: {[col for col in train_data.columns if col not in ['title', 'text', 'label', 'subject', 'date']]}\")\n", + "\n", + "# Statistics for new features\n", + "print(\"\\nStatistics for new features:\")\n", + "feature_cols = ['open_brackets', 'closed_brackets', 
'bracket_difference', 'exclamation_marks', 'question_marks', 'text_char_count', 'title_char_count']\n", + "print(train_data[feature_cols].describe())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing validation dataset...\n", + "Cleaning text data...\n", + "Creating additional features...\n", + "Preparing texts for vectorization...\n", + "✅ Validation dataset prepared!\n", + "Validation dataset size: (4956, 17)\n", + "\n", + "Statistics for new features (validation):\n", + " open_brackets closed_brackets bracket_difference exclamation_marks \\\n", + "count 4956.000000 4956.000000 4956.000000 4956.000000 \n", + "mean 2.403753 2.410815 -0.007062 0.385997 \n", + "std 4.888734 4.891715 0.287216 1.239168 \n", + "min 0.000000 0.000000 -16.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 \n", + "50% 1.000000 1.000000 0.000000 0.000000 \n", + "75% 3.000000 3.000000 0.000000 0.000000 \n", + "max 88.000000 88.000000 1.000000 33.000000 \n", + "\n", + " question_marks text_char_count title_char_count \n", + "count 4956.000000 4956.000000 4956.000000 \n", + "mean 0.983253 3149.895278 82.826069 \n", + "std 2.146671 4131.961782 29.426493 \n", + "min 0.000000 1.000000 17.000000 \n", + "25% 0.000000 1130.250000 63.000000 \n", + "50% 0.000000 2053.500000 76.000000 \n", + "75% 1.000000 3530.000000 98.000000 \n", + "max 27.000000 51794.000000 286.000000 \n" + ] + } + ], + "source": [ + "# Prepare validation dataset\n", + "print(\"Preparing validation dataset...\")\n", + "\n", + "# Clean text data\n", + "print(\"Cleaning text data...\")\n", + "validation_data['title_clean'] = validation_data['title'].apply(clean_text)\n", + "validation_data['text_clean'] = validation_data['text'].apply(clean_text)\n", + "\n", + "# Create additional features\n", + "print(\"Creating additional features...\")\n", + "\n", + "# Count special characters in text\n", + 
"special_chars_val = validation_data['text'].apply(count_special_chars)\n", + "validation_data['open_brackets'] = [x[0] for x in special_chars_val]\n", + "validation_data['closed_brackets'] = [x[1] for x in special_chars_val]\n", + "validation_data['exclamation_marks'] = [x[2] for x in special_chars_val]\n", + "validation_data['question_marks'] = [x[3] for x in special_chars_val]\n", + "\n", + "# New feature: difference between opening and closing brackets\n", + "validation_data['bracket_difference'] = validation_data['open_brackets'] - validation_data['closed_brackets']\n", + "\n", + "# Count characters\n", + "validation_data['text_char_count'] = validation_data['text'].apply(count_characters)\n", + "validation_data['title_char_count'] = validation_data['title'].apply(count_characters)\n", + "\n", + "# Prepare for vectorization\n", + "print(\"Preparing texts for vectorization...\")\n", + "validation_data['title_processed'] = validation_data['title_clean'].apply(preprocess_for_vectorization)\n", + "validation_data['text_processed'] = validation_data['text_clean'].apply(preprocess_for_vectorization)\n", + "\n", + "# Combine title and text for vectorization\n", + "validation_data['combined_text'] = validation_data['title_processed'] + ' ' + validation_data['text_processed']\n", + "\n", + "print(\"✅ Validation dataset prepared!\")\n", + "print(f\"Validation dataset size: {validation_data.shape}\")\n", + "\n", + "# Statistics for new features in validation\n", + "print(\"\\nStatistics for new features (validation):\")\n", + "print(validation_data[feature_cols].describe())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
TF-IDF Vectorization\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Applying TF-IDF vectorization...\n", + "TF-IDF matrix size: (39741, 5000)\n", + "Number of features: 5000\n", + "✅ TF-IDF vectorization completed!\n" + ] + } + ], + "source": [ + "# TF-IDF vectorization\n", + "print(\"Applying TF-IDF vectorization...\")\n", + "\n", + "# Create TF-IDF vectorizer\n", + "tfidf_vectorizer = TfidfVectorizer(\n", + " max_features=5000, # Maximum number of features\n", + " ngram_range=(1, 2), # Unigrams and bigrams\n", + " min_df=2, # Minimum document frequency\n", + " max_df=0.95, # Maximum document frequency\n", + " stop_words='english'\n", + ")\n", + "\n", + "# Apply vectorization to combined text of main dataset\n", + "tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['combined_text'])\n", + "\n", + "print(f\"TF-IDF matrix size: {tfidf_matrix.shape}\")\n", + "print(f\"Number of features: {len(tfidf_vectorizer.get_feature_names_out())}\")\n", + "\n", + "# Create DataFrame with TF-IDF features\n", + "tfidf_df = pd.DataFrame(\n", + " tfidf_matrix.toarray(),\n", + " columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]\n", + ")\n", + "\n", + "print(\"✅ TF-IDF vectorization completed!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Data Preparation for Training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing features for training...\n", + "Feature matrix size: (39741, 5007)\n", + "Target variable size: (39741,)\n", + "Class distribution: [19940 19801]\n", + "Training set size: (31792, 5007)\n", + "Test set size: (7949, 5007)\n", + "✅ Data prepared for training!\n" + ] + } + ], + "source": [ + "# Prepare features for training\n", + "print(\"Preparing features for training...\")\n", + "\n", + "# Select additional features (including new bracket_difference feature)\n", + "additional_features = ['open_brackets', 'closed_brackets', 'bracket_difference', 'exclamation_marks', 'question_marks', 'text_char_count', 'title_char_count']\n", + "\n", + "# Combine TF-IDF features with additional features\n", + "X_tfidf = tfidf_df.values\n", + "X_additional = train_data[additional_features].values\n", + "\n", + "# Combine all features\n", + "X_combined = np.hstack([X_tfidf, X_additional])\n", + "\n", + "# Target variable\n", + "y = train_data['label'].values\n", + "\n", + "print(f\"Feature matrix size: {X_combined.shape}\")\n", + "print(f\"Target variable size: {y.shape}\")\n", + "print(f\"Class distribution: {np.bincount(y)}\")\n", + "\n", + "# Train/test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_combined, y, \n", + " test_size=0.2, \n", + " random_state=42, \n", + " stratify=y\n", + ")\n", + "\n", + "print(f\"Training set size: {X_train.shape}\")\n", + "print(f\"Test set size: {X_test.shape}\")\n", + "print(\"✅ Data prepared for training!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Model Training and Comparison\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Models defined:\n", + "- RandomForest: RandomForestClassifier\n", + "- XGBoost: XGBClassifier\n" + ] + } + ], + "source": [ + "# Define models with different parameters (only Random Forest and XGBoost)\n", + "models = {\n", + " 'RandomForest': RandomForestClassifier(\n", + " n_estimators=200,\n", + " max_depth=15,\n", + " min_samples_split=5,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " ),\n", + " 'XGBoost': xgb.XGBClassifier(\n", + " n_estimators=200,\n", + " max_depth=8,\n", + " learning_rate=0.05,\n", + " subsample=0.8,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + "}\n", + "\n", + "print(\"Models defined:\")\n", + "for name, model in models.items():\n", + " print(f\"- {name}: {type(model).__name__}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training and evaluating models...\n", + "==================================================\n", + "\n", + "Training model: RandomForest\n", + "Test accuracy: 0.9965\n", + "Cross-validation (mean ± std): 0.9961 ± 0.0007\n", + "\n", + "Training model: XGBoost\n", + "Test accuracy: 0.9984\n", + "Cross-validation (mean ± std): 0.9974 ± 0.0012\n", + "\n", + "==================================================\n", + "✅ Training of all models completed!\n" + ] + } + ], + "source": [ + "# Train and evaluate models\n", + "results = {}\n", + "\n", + "print(\"Training and evaluating models...\")\n", + "print(\"=\" * 50)\n", + "\n", + "for name, model in models.items():\n", + " print(f\"\\nTraining model: {name}\")\n", + " \n", + " # Train model\n", + " model.fit(X_train, y_train)\n", + " \n", + " # Predictions\n", + " y_pred = model.predict(X_test)\n", + " \n", + " # Calculate accuracy\n", + " accuracy = 
accuracy_score(y_test, y_pred)\n", + " \n", + " # Cross-validation\n", + " cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')\n", + " \n", + " # Save results\n", + " results[name] = {\n", + " 'model': model,\n", + " 'accuracy': accuracy,\n", + " 'cv_mean': cv_scores.mean(),\n", + " 'cv_std': cv_scores.std(),\n", + " 'predictions': y_pred\n", + " }\n", + " \n", + " print(f\"Test accuracy: {accuracy:.4f}\")\n", + " print(f\"Cross-validation (mean ± std): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 50)\n", + "print(\"✅ Training of all models completed!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Confusion Matrices and Model Comparison\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model comparison results:\n", + "============================================================\n", + " Model Test_Accuracy CV_Mean CV_Std\n", + " XGBoost 0.9984 0.9974 0.0012\n", + "RandomForest 0.9965 0.9961 0.0007\n", + "\n", + "🏆 Best model: XGBoost\n", + "Accuracy: 0.9984\n", + "Model type: XGBClassifier\n", + "\n", + "Detailed report for XGBoost:\n", + "========================================\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 3988\n", + " 1 1.00 1.00 1.00 3961\n", + "\n", + " accuracy 1.00 7949\n", + " macro avg 1.00 1.00 1.00 7949\n", + "weighted avg 1.00 1.00 1.00 7949\n", + "\n" + ] + } + ], + "source": [ + "# Create comparison table of results\n", + "results_df = pd.DataFrame({\n", + " 'Model': list(results.keys()),\n", + " 'Test_Accuracy': [results[name]['accuracy'] for name in results.keys()],\n", + " 'CV_Mean': [results[name]['cv_mean'] for name in results.keys()],\n", + " 'CV_Std': [results[name]['cv_std'] for name in results.keys()]\n", + "})\n", + "\n", + "# Sort by test accuracy\n", + "results_df = 
results_df.sort_values('Test_Accuracy', ascending=False)\n", + "\n", + "print(\"Model comparison results:\")\n", + "print(\"=\" * 60)\n", + "print(results_df.to_string(index=False, float_format='%.4f'))\n", + "\n", + "# Select best model\n", + "best_model_name = results_df.iloc[0]['Model']\n", + "best_model = results[best_model_name]['model']\n", + "best_accuracy = results_df.iloc[0]['Test_Accuracy']\n", + "\n", + "print(f\"\\n🏆 Best model: {best_model_name}\")\n", + "print(f\"Accuracy: {best_accuracy:.4f}\")\n", + "print(f\"Model type: {type(best_model).__name__}\")\n", + "\n", + "# Detailed report for best model\n", + "print(f\"\\nDetailed report for {best_model_name}:\")\n", + "print(\"=\" * 40)\n", + "y_pred_best = results[best_model_name]['predictions']\n", + "print(classification_report(y_test, y_pred_best))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix Details:\n", + "==================================================\n", + "\n", + "RandomForest:\n", + "True Negatives (Fake News correctly classified): 3964\n", + "False Positives (Fake News misclassified as Real): 24\n", + "False Negatives (Real News misclassified as Fake): 4\n", + "True Positives (Real News correctly classified): 3957\n", + "Precision: 0.9940\n", + "Recall: 0.9990\n", + "F1-Score: 0.9965\n", + "\n", + "XGBoost:\n", + "True Negatives (Fake News correctly classified): 3983\n", + "False Positives (Fake News misclassified as Real): 5\n", + "False Negatives (Real News misclassified as Fake): 8\n", + "True Positives (Real News correctly classified): 3953\n", + "Precision: 0.9987\n", + "Recall: 0.9980\n", + "F1-Score: 0.9984\n" + ] + } + ], + "source": [ + "# Create confusion matrices for both models\n", + "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n", + "\n", + "for i, (name, result) in enumerate(results.items()):\n", + " # Calculate confusion matrix\n", + " cm = confusion_matrix(y_test, result['predictions'])\n", + " \n", + " # Plot confusion matrix\n", + " sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i])\n", + " axes[i].set_title(f'Confusion Matrix - {name}\\nAccuracy: {result[\"accuracy\"]:.4f}')\n", + " axes[i].set_xlabel('Predicted')\n", + " axes[i].set_ylabel('Actual')\n", + " \n", + " # Add class labels\n", + " axes[i].set_xticklabels(['Fake News', 'Real News'])\n", + " axes[i].set_yticklabels(['Fake News', 'Real News'])\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Print confusion matrix details\n", + "print(\"Confusion Matrix Details:\")\n", + "print(\"=\" * 50)\n", + "for name, result in results.items():\n", + " cm = confusion_matrix(y_test, result['predictions'])\n", + " print(f\"\\n{name}:\")\n", + " print(f\"True Negatives (Fake News correctly 
classified): {cm[0,0]}\")\n", + " print(f\"False Positives (Fake News misclassified as Real): {cm[0,1]}\")\n", + " print(f\"False Negatives (Real News misclassified as Fake): {cm[1,0]}\")\n", + " print(f\"True Positives (Real News correctly classified): {cm[1,1]}\")\n", + " \n", + " # Calculate additional metrics\n", + " precision = cm[1,1] / (cm[1,1] + cm[0,1]) if (cm[1,1] + cm[0,1]) > 0 else 0\n", + " recall = cm[1,1] / (cm[1,1] + cm[1,0]) if (cm[1,1] + cm[1,0]) > 0 else 0\n", + " f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0\n", + " \n", + " print(f\"Precision: {precision:.4f}\")\n", + " print(f\"Recall: {recall:.4f}\")\n", + " print(f\"F1-Score: {f1:.4f}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Training on Full Dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training best model (XGBoost) on full dataset...\n", + "Full feature matrix size: (39741, 5007)\n", + "Target variable size: (39741,)\n", + "Class distribution: [19940 19801]\n", + "Training model...\n", + "✅ Model trained on full dataset!\n" + ] + } + ], + "source": [ + "# Train best model on full dataset\n", + "print(f\"Training best model ({best_model_name}) on full dataset...\")\n", + "\n", + "# Prepare full dataset\n", + "X_full = X_combined\n", + "y_full = y\n", + "\n", + "print(f\"Full feature matrix size: {X_full.shape}\")\n", + "print(f\"Target variable size: {y_full.shape}\")\n", + "print(f\"Class distribution: {np.bincount(y_full)}\")\n", + "\n", + "# Create new model with same parameters\n", + "if best_model_name == 'XGBoost':\n", + " full_model = xgb.XGBClassifier(**best_model.get_params())\n", + "else:\n", + " full_model = RandomForestClassifier(**best_model.get_params())\n", + "\n", + "# Train model on full dataset\n", + "print(\"Training model...\")\n", + "full_model.fit(X_full, 
y_full)\n", + "\n", + "print(\"✅ Model trained on full dataset!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Feature Importance Analysis\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature importance analysis...\n", + "Top-20 most important features:\n", + " feature importance\n", + "3757 tfidf_3757 0.458675\n", + "5000 open_brackets 0.008268\n", + "1576 tfidf_1576 0.005496\n", + "411 tfidf_411 0.005359\n", + "2419 tfidf_2419 0.005231\n", + "1952 tfidf_1952 0.004854\n", + "3761 tfidf_3761 0.004222\n", + "1830 tfidf_1830 0.004162\n", + "1886 tfidf_1886 0.004020\n", + "2292 tfidf_2292 0.003933\n", + "3959 tfidf_3959 0.003515\n", + "1623 tfidf_1623 0.003495\n", + "65 tfidf_65 0.003129\n", + "412 tfidf_412 0.003011\n", + "5006 title_char_count 0.002945\n", + "3477 tfidf_3477 0.002854\n", + "3487 tfidf_3487 0.002835\n", + "4995 tfidf_4995 0.002822\n", + "1729 tfidf_1729 0.002754\n", + "2949 tfidf_2949 0.002634\n", + "\n", + "Importance of additional features:\n", + " feature importance\n", + "5000 open_brackets 0.008268\n", + "5006 title_char_count 0.002945\n", + "5004 question_marks 0.001684\n", + "5003 exclamation_marks 0.001666\n", + "5005 text_char_count 0.001269\n", + "5001 closed_brackets 0.000415\n", + "5002 bracket_difference 0.000000\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Feature importance analysis completed!\n" + ] + } + ], + "source": [ + "# Feature importance analysis\n", + "print(\"Feature importance analysis...\")\n", + "\n", + "# Get feature importance\n", + "feature_importance = full_model.feature_importances_\n", + "\n", + "# Create list of feature names\n", + "tfidf_feature_names = [f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]\n", + "all_feature_names = tfidf_feature_names + additional_features\n", + "\n", + "# Create DataFrame with feature importance\n", + "importance_df = pd.DataFrame({\n", + " 'feature': all_feature_names,\n", + " 'importance': feature_importance\n", + "}).sort_values('importance', ascending=False)\n", + "\n", + "print(\"Top-20 most important features:\")\n", + "print(importance_df.head(20))\n", + "\n", + "# Analyze importance of additional features\n", + "print(f\"\\nImportance of additional features:\")\n", + "additional_importance = importance_df[importance_df['feature'].isin(additional_features)]\n", + "print(additional_importance)\n", + "\n", + "# Visualize feature importance\n", + "plt.figure(figsize=(12, 8))\n", + "\n", + "# Top-20 TF-IDF features\n", + "top_tfidf = importance_df[importance_df['feature'].str.startswith('tfidf_')].head(20)\n", + "plt.subplot(2, 1, 1)\n", + "plt.barh(range(len(top_tfidf)), top_tfidf['importance'])\n", + "plt.yticks(range(len(top_tfidf)), [f\"TF-IDF {i}\" for i in range(len(top_tfidf))])\n", + "plt.xlabel('Importance')\n", + "plt.title('Top-20 TF-IDF Features by Importance')\n", + "plt.gca().invert_yaxis()\n", + "\n", + "# Additional features\n", + "plt.subplot(2, 1, 2)\n", + "plt.barh(range(len(additional_importance)), additional_importance['importance'])\n", + "plt.yticks(range(len(additional_importance)), additional_importance['feature'])\n", + "plt.xlabel('Importance')\n", + "plt.title('Additional Features Importance')\n", 
+ "plt.gca().invert_yaxis()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"✅ Feature importance analysis completed!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Validation Data Predictions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing validation data for predictions...\n", + "Applying TF-IDF vectorization to validation data...\n", + "TF-IDF matrix size for validation: (4956, 5000)\n", + "Feature matrix size for validation: (4956, 5007)\n", + "✅ Feature sizes match!\n", + "✅ Validation data prepared for predictions!\n" + ] + } + ], + "source": [ + "# Prepare validation data for predictions\n", + "print(\"Preparing validation data for predictions...\")\n", + "\n", + "# Apply TF-IDF vectorization to validation data\n", + "print(\"Applying TF-IDF vectorization to validation data...\")\n", + "tfidf_matrix_val = tfidf_vectorizer.transform(validation_data['combined_text'])\n", + "\n", + "print(f\"TF-IDF matrix size for validation: {tfidf_matrix_val.shape}\")\n", + "\n", + "# Create DataFrame with TF-IDF features for validation\n", + "tfidf_df_val = pd.DataFrame(\n", + " tfidf_matrix_val.toarray(),\n", + " columns=[f'tfidf_{i}' for i in range(tfidf_matrix_val.shape[1])]\n", + ")\n", + "\n", + "# Prepare features for validation\n", + "X_tfidf_val = tfidf_df_val.values\n", + "X_additional_val = validation_data[additional_features].values\n", + "\n", + "# Combine all features for validation\n", + "X_combined_val = np.hstack([X_tfidf_val, X_additional_val])\n", + "\n", + "print(f\"Feature matrix size for validation: {X_combined_val.shape}\")\n", + "\n", + "# Check size compatibility\n", + "if X_combined_val.shape[1] != X_combined.shape[1]:\n", + " print(f\"⚠️ Warning: Feature sizes don't match!\")\n", + " print(f\"Training data: {X_combined.shape[1]} features\")\n", + " 
print(f\"Validation data: {X_combined_val.shape[1]} features\")\n", + "else:\n", + " print(\"✅ Feature sizes match!\")\n", + "\n", + "print(\"✅ Validation data prepared for predictions!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Making predictions on validation data...\n", + "Number of predictions: 4956\n", + "Predicted class distribution: [3501 1455]\n", + "✅ Predictions completed!\n", + "Final dataset size: (4956, 19)\n", + "\n", + "Prediction examples:\n", + " title \\\n", + "0 UK's May 'receiving regular updates' on London... \n", + "1 UK transport police leading investigation of L... \n", + "2 Pacific nations crack down on North Korean shi... \n", + "3 Three suspected al Qaeda militants killed in Y... \n", + "4 Chinese academics prod Beijing to consider Nor... \n", + "5 Flames raced along train at west London statio... \n", + "6 London police advise people to avoid area near... \n", + "7 London ambulance service sends hazardous area ... \n", + "8 Witness says injured in stampede at London sta... \n", + "9 UK says world will stand together against Nort... \n", + "\n", + " text label \\\n", + "0 LONDON (Reuters) - British Prime Minister Ther... 1 \n", + "1 LONDON (Reuters) - British counter-terrorism p... 1 \n", + "2 WELLINGTON (Reuters) - South Pacific island na... 1 \n", + "3 ADEN, Yemen (Reuters) - Three suspected al Qae... 1 \n", + "4 BEIJING (Reuters) - Chinese academics are publ... 1 \n", + "5 LONDON (Reuters) - Flames engulfed one carriag... 1 \n", + "6 LONDON (Reuters) - British police on Friday ad... 1 \n", + "7 LONDON (Reuters) - London s ambulance service ... 1 \n", + "8 LONDON (Reuters) - A woman at London s Parsons... 1 \n", + "9 LONDON (Reuters) - Britain said on Friday the ... 
1 \n", + "\n", + " prediction_proba_0 prediction_proba_1 \n", + "0 0.000225 0.999775 \n", + "1 0.001080 0.998920 \n", + "2 0.000311 0.999689 \n", + "3 0.000137 0.999863 \n", + "4 0.000157 0.999843 \n", + "5 0.000090 0.999910 \n", + "6 0.000097 0.999903 \n", + "7 0.000137 0.999863 \n", + "8 0.000114 0.999886 \n", + "9 0.000159 0.999841 \n" + ] + } + ], + "source": [ + "# Make predictions on validation data\n", + "print(\"Making predictions on validation data...\")\n", + "\n", + "# Predictions\n", + "predictions = full_model.predict(X_combined_val)\n", + "prediction_proba = full_model.predict_proba(X_combined_val)\n", + "\n", + "print(f\"Number of predictions: {len(predictions)}\")\n", + "print(f\"Predicted class distribution: {np.bincount(predictions)}\")\n", + "\n", + "# Create copy of validation data with predictions\n", + "validation_labeled = validation_data.copy()\n", + "validation_labeled['label'] = predictions\n", + "validation_labeled['prediction_proba_0'] = prediction_proba[:, 0]\n", + "validation_labeled['prediction_proba_1'] = prediction_proba[:, 1]\n", + "\n", + "print(\"✅ Predictions completed!\")\n", + "print(f\"Final dataset size: {validation_labeled.shape}\")\n", + "\n", + "# Show prediction examples\n", + "print(\"\\nPrediction examples:\")\n", + "sample_predictions = validation_labeled[['title', 'text', 'label', 'prediction_proba_0', 'prediction_proba_1']].head(10)\n", + "print(sample_predictions)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Save Results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving best model: XGBoost\n", + "✅ Model saved to file: best_fake_news_classifier_xgboost.pkl\n", + "✅ Model information saved to model_info.txt\n", + "\n", + "Model ready for use! 
File: best_fake_news_classifier_xgboost.pkl\n" + ] + } + ], + "source": [ + "# Save best model and necessary components\n", + "print(f\"Saving best model: {best_model_name}\")\n", + "\n", + "# Create dictionary with model and components\n", + "model_package = {\n", + " 'model': full_model,\n", + " 'tfidf_vectorizer': tfidf_vectorizer,\n", + " 'model_name': best_model_name,\n", + " 'accuracy': best_accuracy,\n", + " 'feature_names': additional_features,\n", + " 'preprocessing_functions': {\n", + " 'clean_text': clean_text,\n", + " 'count_special_chars': count_special_chars,\n", + " 'count_characters': count_characters,\n", + " 'preprocess_for_vectorization': preprocess_for_vectorization\n", + " }\n", + "}\n", + "\n", + "# Save model\n", + "model_filename = f'best_fake_news_classifier_{best_model_name.lower()}.pkl'\n", + "joblib.dump(model_package, model_filename)\n", + "\n", + "print(f\"✅ Model saved to file: {model_filename}\")\n", + "\n", + "# Create model information file\n", + "model_info = f\"\"\"\n", + "Information about saved model:\n", + "================================\n", + "Model name: {best_model_name}\n", + "Model type: {type(full_model).__name__}\n", + "Test accuracy: {best_accuracy:.4f}\n", + "Creation date: {pd.Timestamp.now()}\n", + "\n", + "Model parameters:\n", + "{full_model.get_params()}\n", + "\n", + "Used features:\n", + "- TF-IDF vectorization (5000 features)\n", + "- Number of opening brackets\n", + "- Number of closing brackets\n", + "- Difference between opening and closing brackets (NEW FEATURE)\n", + "- Number of exclamation marks \n", + "- Number of question marks\n", + "- Number of characters in text\n", + "- Number of characters in title\n", + "\n", + "Model file: {model_filename}\n", + "\"\"\"\n", + "\n", + "with open('model_info.txt', 'w', encoding='utf-8') as f:\n", + " f.write(model_info)\n", + "\n", + "print(\"✅ Model information saved to model_info.txt\")\n", + "print(f\"\\nModel ready for use! 
File: {model_filename}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving validation results...\n", + "✅ Validation data with predictions saved to file: dataset/validation_XGBoost_labeled_data.csv\n", + "✅ Feature importance analysis saved to file: feature_importance_analysis.csv\n", + "✅ Summary report saved to file: XGBoost_training_report.txt\n", + "\n", + "============================================================\n", + "🎉 ALL TASKS COMPLETED SUCCESSFULLY!\n", + "============================================================\n", + "📁 Main result: dataset/validation_XGBoost_labeled_data.csv\n", + "📊 Feature analysis: feature_importance_analysis.csv\n", + "🤖 Model: best_fake_news_classifier_xgboost.pkl\n", + "📋 Report: XGBoost_training_report.txt\n", + "============================================================\n" + ] + } + ], + "source": [ + "# Save validation results\n", + "print(\"Saving validation results...\")\n", + "\n", + "# Save validation data with predictions in correct format\n", + "output_filename = f'dataset/validation_{best_model_name}_labeled_data.csv'\n", + "\n", + "# Create final dataset with predictions, keeping only necessary columns\n", + "final_validation_data = validation_data[['label', 'title', 'text', 'subject', 'date']].copy()\n", + "final_validation_data['label'] = predictions # Replace original labels with predictions\n", + "\n", + "final_validation_data.to_csv(output_filename, index=False)\n", + "\n", + "print(f\"✅ Validation data with predictions saved to file: {output_filename}\")\n", + "\n", + "# Save feature importance information\n", + "importance_filename = 'feature_importance_analysis.csv'\n", + "importance_df.to_csv(importance_filename, index=False)\n", + "\n", + "print(f\"✅ Feature importance analysis saved to file: {importance_filename}\")\n", + "\n", + "# Create summary report\n", + "summary_report = 
f\"\"\"\n", + "Report on training {best_model_name} model on full dataset\n", + "==================================================\n", + "\n", + "Creation date: {pd.Timestamp.now()}\n", + "\n", + "Data:\n", + "- Main dataset (data.csv): {train_data.shape[0]} records\n", + "- Validation dataset: {validation_data.shape[0]} records\n", + "- Number of features: {X_combined.shape[1]}\n", + "\n", + "Model:\n", + "- Type: {type(full_model).__name__}\n", + "- Parameters: {full_model.get_params()}\n", + "\n", + "Results:\n", + "- Number of predictions: {len(predictions)}\n", + "- Class distribution: {dict(zip(*np.unique(predictions, return_counts=True)))}\n", + "\n", + "New features:\n", + "- Added 'bracket_difference' feature (difference between opening and closing brackets)\n", + "\n", + "Files:\n", + "- Validation with predictions: {output_filename}\n", + "- Feature importance analysis: {importance_filename}\n", + "- Model: {model_filename}\n", + "\n", + "Top-5 most important features:\n", + "{importance_df.head(5).to_string(index=False)}\n", + "\"\"\"\n", + "\n", + "# Save report\n", + "with open(f'{best_model_name}_training_report.txt', 'w', encoding='utf-8') as f:\n", + " f.write(summary_report)\n", + "\n", + "print(f\"✅ Summary report saved to file: {best_model_name}_training_report.txt\")\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"🎉 ALL TASKS COMPLETED SUCCESSFULLY!\")\n", + "print(\"=\"*60)\n", + "print(f\"📁 Main result: {output_filename}\")\n", + "print(f\"📊 Feature analysis: {importance_filename}\")\n", + "print(f\"🤖 Model: {model_filename}\")\n", + "print(f\"📋 Report: {best_model_name}_training_report.txt\")\n", + "print(\"=\"*60)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/3. models_comparison.ipynb b/3. models_comparison.ipynb new file mode 100644 index 0000000..87c408e --- /dev/null +++ b/3. models_comparison.ipynb @@ -0,0 +1,1040 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset Comparison and Metrics Analysis\n", + "\n", + "This notebook compares different validation datasets against the reference dataset (`df_correct_validation`) and calculates key performance metrics including precision, recall, F1-score, and confusion matrices.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.metrics import (\n", + " confusion_matrix, \n", + " classification_report, \n", + " precision_score, \n", + " recall_score, \n", + " f1_score, \n", + " accuracy_score,\n", + " roc_auc_score\n", + ")\n", + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Set style for better plots\n", + "plt.style.use('default')\n", + "sns.set_palette(\"husl\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Reference Dataset\n", + "\n", + "The reference dataset (`df_correct_validation`) will be used as the ground truth for comparison.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reference dataset shape: (4956, 5)\n", + "Reference dataset columns: ['label', 'title', 'text', 'subject', 'date']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labeltitletextsubjectdate
01UK's May 'receiving regular updates' on London...LONDON (Reuters) - British Prime Minister Ther...worldnewsSeptember 15, 2017
11UK transport police leading investigation of L...LONDON (Reuters) - British counter-terrorism p...worldnewsSeptember 15, 2017
21Pacific nations crack down on North Korean shi...WELLINGTON (Reuters) - South Pacific island na...worldnewsSeptember 15, 2017
31Three suspected al Qaeda militants killed in Y...ADEN, Yemen (Reuters) - Three suspected al Qae...worldnewsSeptember 15, 2017
41Chinese academics prod Beijing to consider Nor...BEIJING (Reuters) - Chinese academics are publ...worldnewsSeptember 15, 2017
\n", + "
" + ], + "text/plain": [ + " label title \\\n", + "0 1 UK's May 'receiving regular updates' on London... \n", + "1 1 UK transport police leading investigation of L... \n", + "2 1 Pacific nations crack down on North Korean shi... \n", + "3 1 Three suspected al Qaeda militants killed in Y... \n", + "4 1 Chinese academics prod Beijing to consider Nor... \n", + "\n", + " text subject \\\n", + "0 LONDON (Reuters) - British Prime Minister Ther... worldnews \n", + "1 LONDON (Reuters) - British counter-terrorism p... worldnews \n", + "2 WELLINGTON (Reuters) - South Pacific island na... worldnews \n", + "3 ADEN, Yemen (Reuters) - Three suspected al Qae... worldnews \n", + "4 BEIJING (Reuters) - Chinese academics are publ... worldnews \n", + "\n", + " date \n", + "0 September 15, 2017 \n", + "1 September 15, 2017 \n", + "2 September 15, 2017 \n", + "3 September 15, 2017 \n", + "4 September 15, 2017 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load reference dataset\n", + "df_correct_validation = pd.read_csv(\"dataset/validation_data_simple_labeled.csv\")\n", + "print(f\"Reference dataset shape: {df_correct_validation.shape}\")\n", + "print(f\"Reference dataset columns: {df_correct_validation.columns.tolist()}\")\n", + "df_correct_validation.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load All Validation Datasets for Comparison\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First Classifier: (4956, 5)\n", + " Label distribution: {0: 3484, 1: 1472}\n", + "\n", + "Embeddings: (4956, 5)\n", + " Label distribution: {0: 3235, 1: 1721}\n", + "\n", + "Advanced Embeddings: (4956, 5)\n", + " Label distribution: {0: 4398, 1: 558}\n", + "\n", + "Minus Reuters: (4956, 10)\n", + " Label distribution: {0: 3193, 1: 1763}\n", + "\n", + "XGBoost: (4956, 5)\n", + " 
Label distribution: {0: 3501, 1: 1455}\n", + "\n" + ] + } + ], + "source": [ + "# Load all validation datasets\n", + "datasets = {\n", + " 'First Classifier': pd.read_csv(\"dataset/validation_data_labeled.csv\"),\n", + " 'Embeddings': pd.read_csv(\"dataset/validation_data_labeled_embeddings.csv\"),\n", + " 'Advanced Embeddings': pd.read_csv(\"dataset/validation_data_labeled_embeddings_advanced.csv\"),\n", + " 'Minus Reuters': pd.read_csv(\"dataset/validation_data_labeled_minus_reuters.csv\"),\n", + " 'XGBoost': pd.read_csv(\"dataset/validation_XGBoost_labeled_data.csv\")\n", + "}\n", + "\n", + "# Display dataset information\n", + "for name, df in datasets.items():\n", + " print(f\"{name}: {df.shape}\")\n", + " if 'label' in df.columns:\n", + " print(f\" Label distribution: {df['label'].value_counts().to_dict()}\")\n", + " print()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Label Distribution Visualization\n", + "\n", + "Create pie charts showing the distribution of labels (0 and 1) for each dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create pie charts for all datasets\n", + "fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n", + "axes = axes.flatten()\n", + "\n", + "# Reference dataset\n", + "counts = df_correct_validation['label'].value_counts()\n", + "labels = ['0', '1']\n", + "sizes = [counts.get(0, 0), counts.get(1, 0)]\n", + "axes[0].pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, \n", + " colors=['lightcoral', 'lightblue'])\n", + "axes[0].set_title('Reference Dataset\\n(df_correct_validation)', fontsize=12, fontweight='bold')\n", + "\n", + "# Other datasets\n", + "dataset_names = list(datasets.keys())\n", + "for i, (name, df) in enumerate(datasets.items(), 1):\n", + " if 'label' in df.columns:\n", + " counts = df['label'].value_counts()\n", + " sizes = [counts.get(0, 0), counts.get(1, 0)]\n", + " axes[i].pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, \n", + " colors=['lightcoral', 'lightblue'])\n", + " axes[i].set_title(f'{name}', fontsize=12, fontweight='bold')\n", + " else:\n", + " axes[i].text(0.5, 0.5, 'No label column', ha='center', va='center', transform=axes[i].transAxes)\n", + " axes[i].set_title(f'{name}\\n(No labels)', fontsize=12, fontweight='bold')\n", + "\n", + "# Hide the last subplot if not needed\n", + "if len(datasets) < 5:\n", + " axes[-1].set_visible(False)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metrics Calculation Function\n", + "\n", + "Define a function to calculate all relevant metrics for comparison.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_metrics(y_true, y_pred, dataset_name):\n", + " \"\"\"\n", + " Calculate comprehensive metrics for model evaluation\n", + " \"\"\"\n", + " try:\n", + " # Ensure both arrays have the same length\n", + " min_len = min(len(y_true), 
len(y_pred))\n", + " y_true = y_true[:min_len]\n", + " y_pred = y_pred[:min_len]\n", + " \n", + " # Calculate metrics\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)\n", + " recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)\n", + " f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)\n", + " \n", + " # Calculate per-class metrics\n", + " precision_0 = precision_score(y_true, y_pred, pos_label=0, zero_division=0)\n", + " recall_0 = recall_score(y_true, y_pred, pos_label=0, zero_division=0)\n", + " f1_0 = f1_score(y_true, y_pred, pos_label=0, zero_division=0)\n", + " \n", + " precision_1 = precision_score(y_true, y_pred, pos_label=1, zero_division=0)\n", + " recall_1 = recall_score(y_true, y_pred, pos_label=1, zero_division=0)\n", + " f1_1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0)\n", + " \n", + " # Try to calculate AUC (might fail if only one class present)\n", + " try:\n", + " auc = roc_auc_score(y_true, y_pred)\n", + " except:\n", + " auc = np.nan\n", + " \n", + " return {\n", + " 'Dataset': dataset_name,\n", + " 'Accuracy': accuracy,\n", + " 'Precision (Weighted)': precision,\n", + " 'Recall (Weighted)': recall,\n", + " 'F1-Score (Weighted)': f1,\n", + " 'Precision (Class 0)': precision_0,\n", + " 'Recall (Class 0)': recall_0,\n", + " 'F1-Score (Class 0)': f1_0,\n", + " 'Precision (Class 1)': precision_1,\n", + " 'Recall (Class 1)': recall_1,\n", + " 'F1-Score (Class 1)': f1_1,\n", + " 'AUC': auc,\n", + " 'Total Samples': len(y_true)\n", + " }\n", + " except Exception as e:\n", + " print(f\"Error calculating metrics for {dataset_name}: {e}\")\n", + " return None\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calculate Metrics for All Datasets\n", + "\n", + "Compare each dataset against the reference dataset and calculate comprehensive metrics.\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculating metrics for First Classifier...\n", + "Calculating metrics for Embeddings...\n", + "Calculating metrics for Advanced Embeddings...\n", + "Calculating metrics for Minus Reuters...\n", + "Calculating metrics for XGBoost...\n", + "\n", + "Metrics calculated successfully!\n" + ] + } + ], + "source": [ + "# Calculate metrics for all datasets\n", + "metrics_results = []\n", + "\n", + "# Get reference labels\n", + "reference_labels = df_correct_validation['label'].values\n", + "\n", + "for name, df in datasets.items():\n", + " if 'label' in df.columns:\n", + " print(f\"Calculating metrics for {name}...\")\n", + " dataset_labels = df['label'].values\n", + " \n", + " # Calculate metrics\n", + " metrics = calculate_metrics(reference_labels, dataset_labels, name)\n", + " if metrics:\n", + " metrics_results.append(metrics)\n", + " else:\n", + " print(f\"Skipping {name} - no 'label' column found\")\n", + "\n", + "# Create metrics DataFrame\n", + "if metrics_results:\n", + " metrics_df = pd.DataFrame(metrics_results)\n", + " print(\"\\nMetrics calculated successfully!\")\n", + "else:\n", + " print(\"No metrics could be calculated.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metrics Summary Table\n", + "\n", + "Display all calculated metrics in a comprehensive table.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "COMPREHENSIVE METRICS COMPARISON\n", + "================================================================================\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatasetAccuracyPrecision (Weighted)Recall (Weighted)F1-Score (Weighted)Precision (Class 0)Recall (Class 0)F1-Score (Class 0)Precision (Class 1)Recall (Class 1)F1-Score (Class 1)AUCTotal Samples
0First Classifier0.98910.98950.98910.98921.00000.98470.99230.96331.00000.98130.99244956
1Embeddings0.92550.93690.92550.92750.98980.90500.94550.80480.97670.88240.94094956
2Advanced Embeddings0.71670.67680.71670.67040.74260.92310.82310.51250.20170.28950.56244956
3Minus Reuters0.92470.93860.92470.92690.99560.89850.94460.79640.99010.88270.94434956
4XGBoost0.99210.99230.99210.99220.99970.98930.99450.97390.99930.98640.99434956
\n", + "
" + ], + "text/plain": [ + " Dataset Accuracy Precision (Weighted) Recall (Weighted) \\\n", + "0 First Classifier 0.9891 0.9895 0.9891 \n", + "1 Embeddings 0.9255 0.9369 0.9255 \n", + "2 Advanced Embeddings 0.7167 0.6768 0.7167 \n", + "3 Minus Reuters 0.9247 0.9386 0.9247 \n", + "4 XGBoost 0.9921 0.9923 0.9921 \n", + "\n", + " F1-Score (Weighted) Precision (Class 0) Recall (Class 0) \\\n", + "0 0.9892 1.0000 0.9847 \n", + "1 0.9275 0.9898 0.9050 \n", + "2 0.6704 0.7426 0.9231 \n", + "3 0.9269 0.9956 0.8985 \n", + "4 0.9922 0.9997 0.9893 \n", + "\n", + " F1-Score (Class 0) Precision (Class 1) Recall (Class 1) \\\n", + "0 0.9923 0.9633 1.0000 \n", + "1 0.9455 0.8048 0.9767 \n", + "2 0.8231 0.5125 0.2017 \n", + "3 0.9446 0.7964 0.9901 \n", + "4 0.9945 0.9739 0.9993 \n", + "\n", + " F1-Score (Class 1) AUC Total Samples \n", + "0 0.9813 0.9924 4956 \n", + "1 0.8824 0.9409 4956 \n", + "2 0.2895 0.5624 4956 \n", + "3 0.8827 0.9443 4956 \n", + "4 0.9864 0.9943 4956 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metrics saved to 'dataset_comparison_metrics.csv'\n" + ] + } + ], + "source": [ + "if 'metrics_df' in locals():\n", + " # Display the metrics table\n", + " print(\"\\n\" + \"=\"*80)\n", + " print(\"COMPREHENSIVE METRICS COMPARISON\")\n", + " print(\"=\"*80)\n", + " \n", + " # Round numeric columns for better readability\n", + " numeric_columns = metrics_df.select_dtypes(include=[np.number]).columns\n", + " metrics_df_display = metrics_df.copy()\n", + " metrics_df_display[numeric_columns] = metrics_df_display[numeric_columns].round(4)\n", + " \n", + " display(metrics_df_display)\n", + " \n", + " # Save to CSV\n", + " metrics_df.to_csv('dataset_comparison_metrics.csv', index=False)\n", + " print(\"\\nMetrics saved to 'dataset_comparison_metrics.csv'\")\n", + "else:\n", + " print(\"No metrics data available to display.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## Confusion Matrices\n", + "\n", + "Create confusion matrices for each dataset comparison to visualize the classification performance.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create confusion matrices for all datasets\n", + "n_datasets = len([name for name, df in datasets.items() if 'label' in df.columns])\n", + "if n_datasets > 0:\n", + " fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n", + " axes = axes.flatten()\n", + " \n", + " plot_idx = 0\n", + " \n", + " for name, df in datasets.items():\n", + " if 'label' in df.columns:\n", + " # Get labels\n", + " y_true = reference_labels\n", + " y_pred = df['label'].values\n", + " \n", + " # Ensure same length\n", + " min_len = min(len(y_true), len(y_pred))\n", + " y_true = y_true[:min_len]\n", + " y_pred = y_pred[:min_len]\n", + " \n", + " # Create confusion matrix\n", + " cm = confusion_matrix(y_true, y_pred)\n", + " \n", + " # Plot confusion matrix\n", + " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['0', '1'])\n", + " disp.plot(ax=axes[plot_idx], cmap='Blues', values_format='d')\n", + " axes[plot_idx].set_title(f'{name}\\nvs Reference Dataset', fontweight='bold')\n", + " \n", + " plot_idx += 1\n", + " \n", + " # Hide unused subplots\n", + " for i in range(plot_idx, len(axes)):\n", + " axes[i].set_visible(False)\n", + " \n", + " plt.tight_layout()\n", + " plt.show()\n", + "else:\n", + " print(\"No datasets with 'label' column found for confusion matrix creation.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Detailed Classification Reports\n", + "\n", + "Generate detailed classification reports for each dataset comparison.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "CLASSIFICATION REPORT: First Classifier\n", + "============================================================\n", + " precision recall f1-score support\n", + "\n", + " Class 0 1.00 0.98 
0.99 3538\n", + " Class 1 0.96 1.00 0.98 1418\n", + "\n", + " accuracy 0.99 4956\n", + " macro avg 0.98 0.99 0.99 4956\n", + "weighted avg 0.99 0.99 0.99 4956\n", + "\n", + "Total samples compared: 4956\n", + "Agreement rate: 0.9891\n", + "Total disagreements: 54 (1.09%)\n", + "\n", + "Disagreement breakdown:\n", + "Reference Predicted\n", + "0 1 54\n", + "\n", + "============================================================\n", + "CLASSIFICATION REPORT: Embeddings\n", + "============================================================\n", + " precision recall f1-score support\n", + "\n", + " Class 0 0.99 0.91 0.95 3538\n", + " Class 1 0.80 0.98 0.88 1418\n", + "\n", + " accuracy 0.93 4956\n", + " macro avg 0.90 0.94 0.91 4956\n", + "weighted avg 0.94 0.93 0.93 4956\n", + "\n", + "Total samples compared: 4956\n", + "Agreement rate: 0.9255\n", + "Total disagreements: 369 (7.45%)\n", + "\n", + "Disagreement breakdown:\n", + "Reference Predicted\n", + "0 1 336\n", + "1 0 33\n", + "\n", + "============================================================\n", + "CLASSIFICATION REPORT: Advanced Embeddings\n", + "============================================================\n", + " precision recall f1-score support\n", + "\n", + " Class 0 0.74 0.92 0.82 3538\n", + " Class 1 0.51 0.20 0.29 1418\n", + "\n", + " accuracy 0.72 4956\n", + " macro avg 0.63 0.56 0.56 4956\n", + "weighted avg 0.68 0.72 0.67 4956\n", + "\n", + "Total samples compared: 4956\n", + "Agreement rate: 0.7167\n", + "Total disagreements: 1404 (28.33%)\n", + "\n", + "Disagreement breakdown:\n", + "Reference Predicted\n", + "1 0 1132\n", + "0 1 272\n", + "\n", + "============================================================\n", + "CLASSIFICATION REPORT: Minus Reuters\n", + "============================================================\n", + " precision recall f1-score support\n", + "\n", + " Class 0 1.00 0.90 0.94 3538\n", + " Class 1 0.80 0.99 0.88 1418\n", + "\n", + " accuracy 0.92 4956\n", + " macro avg 0.90 0.94 
0.91 4956\n", + "weighted avg 0.94 0.92 0.93 4956\n", + "\n", + "Total samples compared: 4956\n", + "Agreement rate: 0.9247\n", + "Total disagreements: 373 (7.53%)\n", + "\n", + "Disagreement breakdown:\n", + "Reference Predicted\n", + "0 1 359\n", + "1 0 14\n", + "\n", + "============================================================\n", + "CLASSIFICATION REPORT: XGBoost\n", + "============================================================\n", + " precision recall f1-score support\n", + "\n", + " Class 0 1.00 0.99 0.99 3538\n", + " Class 1 0.97 1.00 0.99 1418\n", + "\n", + " accuracy 0.99 4956\n", + " macro avg 0.99 0.99 0.99 4956\n", + "weighted avg 0.99 0.99 0.99 4956\n", + "\n", + "Total samples compared: 4956\n", + "Agreement rate: 0.9921\n", + "Total disagreements: 39 (0.79%)\n", + "\n", + "Disagreement breakdown:\n", + "Reference Predicted\n", + "0 1 38\n", + "1 0 1\n" + ] + } + ], + "source": [ + "# Generate detailed classification reports\n", + "for name, df in datasets.items():\n", + " if 'label' in df.columns:\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"CLASSIFICATION REPORT: {name}\")\n", + " print(f\"{'='*60}\")\n", + " \n", + " # Get labels\n", + " y_true = reference_labels\n", + " y_pred = df['label'].values\n", + " \n", + " # Ensure same length\n", + " min_len = min(len(y_true), len(y_pred))\n", + " y_true = y_true[:min_len]\n", + " y_pred = y_pred[:min_len]\n", + " \n", + " # Generate classification report\n", + " report = classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'])\n", + " print(report)\n", + " \n", + " # Additional statistics\n", + " print(f\"Total samples compared: {len(y_true)}\")\n", + " print(f\"Agreement rate: {accuracy_score(y_true, y_pred):.4f}\")\n", + " \n", + " # Count disagreements\n", + " disagreements = (y_true != y_pred).sum()\n", + " print(f\"Total disagreements: {disagreements} ({disagreements/len(y_true)*100:.2f}%)\")\n", + " \n", + " # Show disagreement breakdown\n", + " if disagreements > 0:\n", 
+ " disagreement_df = pd.DataFrame({\n", + " 'Reference': y_true[y_true != y_pred],\n", + " 'Predicted': y_pred[y_true != y_pred]\n", + " })\n", + " print(\"\\nDisagreement breakdown:\")\n", + " print(disagreement_df.value_counts().to_string())\n", + " else:\n", + " print(f\"\\nSkipping {name} - no 'label' column found\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metrics Visualization\n", + "\n", + "Create visualizations to compare key metrics across datasets.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if 'metrics_df' in locals() and len(metrics_df) > 0:\n", + " # Create metrics comparison plots\n", + " fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n", + " \n", + " # Key metrics to plot\n", + " key_metrics = ['Accuracy', 'Precision (Class 1)', 'Recall (Class 1)', 'F1-Score (Class 1)']\n", + " \n", + " for i, metric in enumerate(key_metrics):\n", + " ax = axes[i//2, i%2]\n", + " \n", + " # Create bar plot\n", + " bars = ax.bar(metrics_df['Dataset'], metrics_df[metric], \n", + " color=plt.cm.viridis(np.linspace(0, 1, len(metrics_df))))\n", + " \n", + " ax.set_title(f'{metric} Comparison', fontweight='bold')\n", + " ax.set_ylabel(metric)\n", + " ax.set_ylim(0, 1)\n", + " \n", + " # Rotate x-axis labels for better readability\n", + " ax.tick_params(axis='x', rotation=45)\n", + " \n", + " # Add value labels on bars\n", + " for bar, value in zip(bars, metrics_df[metric]):\n", + " ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, \n", + " f'{value:.3f}', ha='center', va='bottom', fontsize=9)\n", + " \n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + " # Create a heatmap of all metrics\n", + " plt.figure(figsize=(12, 8))\n", + " \n", + " # Select numeric columns for heatmap\n", + " heatmap_data = metrics_df.set_index('Dataset')[numeric_columns].T\n", + " \n", + " sns.heatmap(heatmap_data, annot=True, cmap='YlOrRd', fmt='.3f', \n", + " cbar_kws={'label': 'Metric Value'})\n", + " plt.title('Metrics Heatmap - All Datasets vs Reference', fontweight='bold', fontsize=14)\n", + " plt.xlabel('Dataset')\n", + " plt.ylabel('Metric')\n", + " plt.xticks(rotation=45)\n", + " plt.yticks(rotation=0)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "else:\n", + " print(\"No metrics data available for visualization.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary and Conclusions\n", + "\n", + "Based on the analysis above, we 
can draw the following conclusions:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "ANALYSIS SUMMARY\n", + "================================================================================\n", + "\n", + "Best performing dataset for each metric:\n", + " Accuracy: XGBoost (0.9921)\n", + " Precision (Weighted): XGBoost (0.9923)\n", + " Recall (Weighted): XGBoost (0.9921)\n", + " F1-Score (Weighted): XGBoost (0.9922)\n", + "\n", + "Overall ranking (by F1-Score):\n", + " 1. XGBoost: 0.9922\n", + " 2. First Classifier: 0.9892\n", + " 3. Embeddings: 0.9275\n", + " 4. Minus Reuters: 0.9269\n", + " 5. Advanced Embeddings: 0.6704\n", + "\n", + "Dataset with highest agreement with reference: XGBoost (0.9921)\n", + "\n", + "Average performance across all datasets:\n", + " Accuracy: 0.9096\n", + " Precision (Weighted): 0.9068\n", + " Recall (Weighted): 0.9096\n", + " F1-Score (Weighted): 0.9012\n", + "\n", + "================================================================================\n", + "Analysis completed successfully!\n", + "================================================================================\n" + ] + } + ], + "source": [ + "if 'metrics_df' in locals() and len(metrics_df) > 0:\n", + " print(\"\\n\" + \"=\"*80)\n", + " print(\"ANALYSIS SUMMARY\")\n", + " print(\"=\"*80)\n", + " \n", + " # Find best performing dataset for each metric\n", + " key_metrics = ['Accuracy', 'Precision (Weighted)', 'Recall (Weighted)', 'F1-Score (Weighted)']\n", + " \n", + " print(\"\\nBest performing dataset for each metric:\")\n", + " for metric in key_metrics:\n", + " best_idx = metrics_df[metric].idxmax()\n", + " best_dataset = metrics_df.loc[best_idx, 'Dataset']\n", + " best_value = metrics_df.loc[best_idx, metric]\n", + " print(f\" {metric}: {best_dataset} 
({best_value:.4f})\")\n", + " \n", + " # Overall ranking based on F1-score\n", + " print(\"\\nOverall ranking (by F1-Score):\")\n", + " ranking = metrics_df.sort_values('F1-Score (Weighted)', ascending=False)\n", + " for i, (_, row) in enumerate(ranking.iterrows(), 1):\n", + " print(f\" {i}. {row['Dataset']}: {row['F1-Score (Weighted)']:.4f}\")\n", + " \n", + " # Dataset with highest agreement\n", + " best_accuracy_idx = metrics_df['Accuracy'].idxmax()\n", + " best_accuracy_dataset = metrics_df.loc[best_accuracy_idx, 'Dataset']\n", + " best_accuracy_value = metrics_df.loc[best_accuracy_idx, 'Accuracy']\n", + " \n", + " print(f\"\\nDataset with highest agreement with reference: {best_accuracy_dataset} ({best_accuracy_value:.4f})\")\n", + " \n", + " # Calculate average performance\n", + " avg_metrics = metrics_df[key_metrics].mean()\n", + " print(\"\\nAverage performance across all datasets:\")\n", + " for metric, value in avg_metrics.items():\n", + " print(f\" {metric}: {value:.4f}\")\n", + " \n", + " print(\"\\n\" + \"=\"*80)\n", + " print(\"Analysis completed successfully!\")\n", + " print(\"=\"*80)\n", + "else:\n", + " print(\"No analysis could be performed due to missing metrics data.\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/README.md b/README.md index a4ac7ab..76bacec 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,104 @@ -![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png) +# NLP Challenge - Fake News Classification -# PROJECT | Natural Language Processing Challenge +A comprehensive 
project for fake news classification using various machine learning methods and natural language processing techniques. -## Introduction +## Project Structure -Learning how to process text is a skill required for Data Scientists/AI Engineers. +### Jupyter Notebooks +- `1. data_exploration.ipynb` - Data exploration and analysis +- `2.1 enhanced_classifier.ipynb` - Enhanced classifier implementation +- `2.2 embeddings_classifier.ipynb` - Classifier with word embeddings +- `2.3 embeddings_advanced_classifier.ipynb` - Advanced classifier with embeddings +- `2.4 enhanced_classifier_minus_reuters.ipynb` - Classifier excluding Reuters data +- `2.5 Final_classifier_and_XGBoost.ipynb` - Final classifier with XGBoost +- `3. models_comparison.ipynb` - Model comparison and evaluation -In this project, you will put these skills into practice to identify whether a news headline is real or fake news. +### Scripts +- `setup.sh` - Environment setup script +- `start_notebook.sh` - Jupyter Notebook startup script -## Project Overview +## Installation and Setup -In the file `dataset/data.csv`, you will find a dataset containing news articles with the following columns: +1. Clone the repository: +```bash +git clone <repository-url> +cd project-nlp-challenge +``` -- **`label`**: 0 if the news is fake, 1 if the news is real. -- **`title`**: The headline of the news article. -- **`text`**: The full content of the article. -- **`subject`**: The category or topic of the news. -- **`date`**: The publication date of the article. +2. Create a virtual environment: +```bash +python -m venv venv +source venv/bin/activate # On macOS/Linux +# or +venv\Scripts\activate # On Windows +``` -Your goal is to build a classifier that is able to distinguish between the two. +3. Install dependencies: +```bash +pip install -r requirements.txt +``` -Once you have a classifier built, then use it to predict the labels for `dataset/validation_data.csv`. 
Generate a new file -where the label `2` has been replaced by `0` (fake) or `1` (real) according to your model. Please respect the original file format, -do not include extra columns, and respect the column separator. +4. Start Jupyter Notebook: +```bash +jupyter notebook +``` -Please ensure to split the `data.csv` into **training** and **test** datasets before using it for model training or evaluation. +## Project Description -## Guidance +This project presents a comprehensive study of fake news classification methods, including: -Like in a real life scenario, you are able to make your own choices and text treatment. -Use the techniques you have learned and the common packages to process this data and classify the text. +- Text data analysis and preprocessing +- Feature extraction using TF-IDF +- Word embeddings implementation +- Application of various machine learning algorithms +- Performance comparison of different approaches -## Deliverables +## Key Features -1. **Python Code:** Provide well-documented Python code that conducts the analysis. -2. **Predictions:** A csv file in the same format as `validation_data.csv` but with the predicted labels (0 or 1) -3. **Accuracy estimation:** Provide the teacher with your estimation of how your model will perform. -4. **Presentation:** You will present your model in a 10-minute presentation. Your teacher will provide further instructions. +- **Data Exploration**: Comprehensive analysis of the dataset structure and characteristics +- **Feature Engineering**: Multiple approaches to text feature extraction +- **Model Variety**: Implementation of traditional ML and advanced techniques +- **Performance Evaluation**: Detailed comparison of model performance metrics +- **Reproducible Research**: Well-documented notebooks with clear methodology + +## Methodology + +The project follows a systematic approach: + +1. **Data Analysis**: Understanding the dataset structure and quality +2. 
**Preprocessing**: Text cleaning, tokenization, and normalization +3. **Feature Extraction**: TF-IDF, word embeddings, and custom features +4. **Model Training**: Multiple algorithms including Logistic Regression, SVM, and XGBoost +5. **Evaluation**: Comprehensive performance metrics and comparison + +## Results + +The project demonstrates various approaches to fake news classification and compares their effectiveness across different metrics including accuracy, precision, recall, and F1-score. + +## Technologies Used + +- **Python 3.x** - Core programming language +- **Jupyter Notebook** - Interactive development environment +- **scikit-learn** - Machine learning library +- **pandas** - Data manipulation and analysis +- **numpy** - Numerical computing +- **matplotlib & seaborn** - Data visualization +- **XGBoost** - Gradient boosting framework +- **NLTK** - Natural language processing toolkit +- **spaCy** - Advanced NLP library + +## Dataset + +The project uses a curated dataset of news articles labeled as real or fake, providing a solid foundation for training and evaluation of classification models. + +## Contributing + +This is a research project showcasing various NLP and ML techniques for fake news detection. Feel free to explore the notebooks and adapt the methods for your own use cases. + +## Author + +Sergej + +## License + +This project is for educational and research purposes. 
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8874dfd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,28 @@ +# Core machine learning libraries +numpy>=1.21.0 +pandas>=1.3.0 +scikit-learn>=1.0.0 +matplotlib>=3.4.0 +seaborn>=0.11.0 + +# Jupyter Notebook +jupyter>=1.0.0 +ipykernel>=6.0.0 + +# NLP libraries +nltk>=3.6.0 +spacy>=3.4.0 + +# XGBoost +xgboost>=1.5.0 + +# Additional utilities +tqdm>=4.62.0 +requests>=2.25.0 + +# Text processing +wordcloud>=1.8.0 +textblob>=0.17.0 + +# Visualization +plotly>=5.0.0 diff --git a/setup.sh b/setup.sh new file mode 100755 index 0000000..e234d15 --- /dev/null +++ b/setup.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Setup script for the NLP Challenge project + +echo "🚀 Setting up the NLP Challenge project..." + +# Check if virtual environment exists +if [ ! -d "venv" ]; then + echo "📦 Creating virtual environment..." + python3 -m venv venv +fi + +# Activate virtual environment +echo "🔧 Activating virtual environment..." +source venv/bin/activate + +# Install required packages +echo "📚 Installing required packages..." +pip install pandas numpy matplotlib scikit-learn nltk jupyter + +# Download NLTK data +echo "📥 Downloading NLTK data..." +python -c " +import nltk +try: + nltk.data.find('tokenizers/punkt') +except LookupError: + nltk.download('punkt') + +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +try: + nltk.data.find('corpora/wordnet') +except LookupError: + nltk.download('wordnet') +print('NLTK data downloaded successfully!') +" + +echo "✅ Setup complete!" +echo "" +echo "📋 Next steps:" +echo "1. Activate the virtual environment: source venv/bin/activate" +echo "2. Start Jupyter: jupyter notebook" +echo "3. Open: 1. data_exploration.ipynb" +echo "4. Run all cells to explore your dataset" +echo "" +echo "🎯 Happy exploring!" 
diff --git a/start_notebook.sh b/start_notebook.sh new file mode 100755 index 0000000..b54cf22 --- /dev/null +++ b/start_notebook.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Quick start script for the NLP Challenge + +echo "🚀 Starting NLP Challenge Data Exploration..." + +# Activate virtual environment +echo "🔧 Activating virtual environment..." +source venv/bin/activate + +# Download missing NLTK data +echo "📥 Downloading NLTK data..." +python -c " +import nltk +try: + nltk.data.find('corpora/wordnet') + print('WordNet already available') +except LookupError: + nltk.download('wordnet') + print('WordNet downloaded successfully') +" + +# Start Jupyter +echo "📊 Starting Jupyter Notebook..." +echo "🌐 Opening browser to Jupyter..." +echo "📁 Open: 1. data_exploration.ipynb" +echo "" +echo "💡 Tips:" +echo "- Run all cells sequentially from top to bottom" +echo "- Some cells may take time to execute" +echo "- All visualizations will be displayed automatically" +echo "" +echo "🎯 Happy exploring!" + +jupyter notebook