slight adjustments

72248df0 · Denis Cvach · 260ed044 · 72248df0
Commit 72248df0 authored 6 months ago by Denis Cvach
--- a/05a_NLP_text_classification/NLP_classifyer.ipynb
+++ b/05a_NLP_text_classification/NLP_classifyer.ipynb
@@ -4,8 +4,8 @@
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2025-01-11T12:18:39.163081Z",
+     "end_time": "2025-01-13T15:39:33.191089Z",
-     "start_time": "2025-01-11T12:18:36.609389Z"
+     "start_time": "2025-01-13T15:39:33.186596Z"
    }
   },
   "source": [
@@ -16,9 +16,20 @@
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.compose import ColumnTransformer"
-    "\n",
+   ],
-    "# Load data\n",
+   "outputs": [],
+   "execution_count": 6
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.249911Z",
+     "start_time": "2025-01-13T15:39:33.219395Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "data_path = 'data/Movie_Overview_Classification.csv'\n",
    "\n",
    "# Read the file, allowing malformed rows to be skipped\n",
@@ -34,11 +45,156 @@
    "\n",
    "    print(\"Data loaded and cleaned successfully.\")\n",
    "except Exception as e:\n",
-    "    print(\"Error during cleaning:\", str(e))\n",
+    "    print(\"Error during cleaning:\", str(e))"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data loaded and cleaned successfully.\n"
+     ]
+    }
+   ],
+   "execution_count": 7
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.364366Z",
+     "start_time": "2025-01-13T15:39:33.344373Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Check the first few rows of the data\n",
-    "data.head()\n",
+    "data.head()"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                                                                                                                                                                                                                                                               id  \\\n",
+       "1  |When Lou                                          who has become the \"father of the Internet         is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr...   who inspired by the death of their former man...   \n",
+       "17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner          who is also in the waiting list. They sit toget...   and Jack reveals that he wants Lisa to change...   \n",
+       "34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder                leaner                                             faster paced and even more entertaining antholo...                   with a new crop of award-winning   \n",
+       "74 |Two students from the Czech Film Academy commi... posters                                            flyers with photos of fake Czech Dream products    a promotional song                                                                   an internet site   \n",
+       "99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day      however                                            he is mistaken for a real Professor and takes h...       Sara is engaged to real-life hero Max Mordon   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                         overview  \\\n",
+       "1  |When Lou                                          who has become the \"father of the Internet         is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr...   get back on the stage for one concert in New ...   \n",
+       "17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner          who is also in the waiting list. They sit toget...   Lisa's father will be killed by a hit man. Li...   \n",
+       "34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder                leaner                                             faster paced and even more entertaining antholo...        visionary filmmakers from around the globe.   \n",
+       "74 |Two students from the Czech Film Academy commi... posters                                            flyers with photos of fake Czech Dream products    a promotional song                                   and ads in newspapers and magazines. Will peo...   \n",
+       "99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day      however                                            he is mistaken for a real Professor and takes h...   but Max has secretly betrayed the Professor b...   \n",
+       "\n",
+       "                                                                                                                                                                                                                genre_Drama  \n",
+       "1  |When Lou                                          who has become the \"father of the Internet         is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr...          0.0  \n",
+       "17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner          who is also in the waiting list. They sit toget...          0.0  \n",
+       "34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder                leaner                                             faster paced and even more entertaining antholo...          0.0  \n",
+       "74 |Two students from the Czech Film Academy commi... posters                                            flyers with photos of fake Czech Dream products    a promotional song                                          0.0  \n",
+       "99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day      however                                            he is mistaken for a real Professor and takes h...          0.0  "
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>overview</th>\n",
+       "      <th>genre_Drama</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <th>|When Lou</th>\n",
+       "      <th>who has become the \"father of the Internet</th>\n",
+       "      <th>is shot by an unknown assailant, Jacob and Nick fire up the time machine again to save their friend.|,0\\r\\n2,|Mia Thermopolis is now a college graduate and on her way to Genovia to take up her duties as princess. Her best friend Lilly also joins her for the summer. Mia continues her 'princess lessons'- riding horses side-saddle, archery, and other royal. But her complicated life is turned upside down once again when she not only learns that she is to take the crown as queen earlier than expected...|,1\\r\\n3,|Under the direction of a ruthless instructor, a talented young drummer begins to pursue perfection at any cost, even his humanity.|,1\\r\\n4,|Vidya Bagchi (Vidya Balan) arrives in Kolkata from London to find her missing husband Arnab Bagchi. Seven months pregnant and alone in a festive city, she begins a relentless search for her husband. With nothing to rely on except fragments from her memories about him, all clues seem to reach a dead end when everyone tries to convince Vidya that her husband does not exist. She slowly realises that nothing is what it seems. In a city soaked in lies, Vidya is determined to unravel the truth about her husband - for herself and her unborn child - even at the cost of her own life.|,1\\r\\n5,|Marine Boy is the story of a former national swimmer who finds himself in debt, and out of desperation is employed as a mule by a gangster boss, smuggling drugs by sea.|,0\\r\\n6,|Pinocchio and his friends, a glow worm and a marionette, search for a magic music box. However, so are the evil Scalawag and the Emperor of the Night.|,0\\r\\n7,|A young girl buys an antique box at a yard sale, unaware that inside the collectible lives a malicious ancient spirit. The girl's father teams with his ex-wife to find a way to end the curse upon their child.|,0\\r\\n8,|A chronicle which provides a rare window into the international perception of the Iraq War, courtesy of Al Jazeera, the Arab world's most popular news outlet. 
Roundly criticized by Cabinet members and Pentagon officials for reporting with a pro-Iraqi bias, and strongly condemned for frequently airing civilian causalities as well as footage of American POWs, the station has revealed (and continues to show the world) everything about the Iraq War that the Bush administration did not want it to see.|,0\\r\\n9,|After telling the story of Flint's last journey to young Jim Hawkins, Billy Bones has a heart attack and dies just as Jim and his friends are attacked by pirates. The gang escapes into the town where they hire out a boat and crew to find the hidden treasure, which was revealed by Bones before he died. On their voyage across the seas, they soon find out that not everyone on board can be trusted.|,0\\r\\n10,|In A Mighty Wind\"</th>\n",
+       "      <th>director Christopher Guest reunites the team from \"Best In Show\" and \"Waiting for Guffman\" to tell tell the story of 60's-era folk musicians</th>\n",
+       "      <td>who inspired by the death of their former man...</td>\n",
+       "      <td>get back on the stage for one concert in New ...</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <th>|After attending the funeral of her grandmother in Dallas</th>\n",
+       "      <th>the Lux Atlantic Hotel manager Lisa is waiting for a flight to Miami. Due to the bad weather and consequent flight delay</th>\n",
+       "      <th>she meets in the airport bar Jack Rippner</th>\n",
+       "      <th>who is also in the waiting list. They sit together in the plane</th>\n",
+       "      <td>and Jack reveals that he wants Lisa to change...</td>\n",
+       "      <td>Lisa's father will be killed by a hit man. Li...</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <th>|Taking all that was great from the first installment</th>\n",
+       "      <th>ABCs OF DEATH 2 aims to be a wilder</th>\n",
+       "      <th>leaner</th>\n",
+       "      <th>faster paced and even more entertaining anthology this time around</th>\n",
+       "      <td>with a new crop of award-winning</td>\n",
+       "      <td>visionary filmmakers from around the globe.</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>74</th>\n",
+       "      <th>|Two students from the Czech Film Academy commission a leading advertising agency to organize a huge campaign for the opening of a new supermarket named Czech Dream. The supermarket however does not exist and is not meant to. The advertising campaign includes radio and television ads</th>\n",
+       "      <th>posters</th>\n",
+       "      <th>flyers with photos of fake Czech Dream products</th>\n",
+       "      <th>a promotional song</th>\n",
+       "      <td>an internet site</td>\n",
+       "      <td>and ads in newspapers and magazines. Will peo...</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <th>|Tad is a celebrity archeologist and adventurer just like his hero Max Mordon... in his dreams! In reality</th>\n",
+       "      <th>Tad is a Chicago construction worker. One day</th>\n",
+       "      <th>however</th>\n",
+       "      <th>he is mistaken for a real Professor and takes his place on a flight to Peru in search of the Lost City of Paititi. Professor Lavrof and his beautiful daughter Sara are waiting for the famous professor to crack the code. Unfortunately for Tad</th>\n",
+       "      <td>Sara is engaged to real-life hero Max Mordon</td>\n",
+       "      <td>but Max has secretly betrayed the Professor b...</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 8
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.401745Z",
+     "start_time": "2025-01-13T15:39:33.395670Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Pre-processing\n",
    "# Check for missing values\n",
    "missing_columns = data.columns[data.isnull().any()].tolist()\n",
@@ -47,18 +203,62 @@
    "# Replace missing values if any\n",
    "if missing_columns:\n",
    "    imputer = SimpleImputer(strategy='most_frequent')\n",
-    "    data[missing_columns] = imputer.fit_transform(data[missing_columns])\n",
+    "    data[missing_columns] = imputer.fit_transform(data[missing_columns])"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Columns with missing values: []\n"
+     ]
+    }
+   ],
+   "execution_count": 9
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.447745Z",
+     "start_time": "2025-01-13T15:39:33.443745Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Define features and target\n",
    "X = data['overview']  # Assuming the overview column contains text data\n",
-    "y = data['genre_Drama']     # Assuming the genre column contains labels\n",
+    "y = data['genre_Drama']     # Assuming the genre column contains labels"
-    "\n",
+   ],
+   "outputs": [],
+   "execution_count": 10
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.460934Z",
+     "start_time": "2025-01-13T15:39:33.453768Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Implement a pipeline with TfidfVectorizer and RandomForestClassifier\n",
    "pipeline = Pipeline([\n",
    "    ('tfidf', TfidfVectorizer(stop_words='english')),\n",
    "    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))\n",
-    "])\n",
+    "])"
-    "\n",
+   ],
+   "outputs": [],
+   "execution_count": 11
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.797554Z",
+     "start_time": "2025-01-13T15:39:33.612084Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Split data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
@@ -66,54 +266,612 @@
    "pipeline.fit(X_train, y_train)\n",
    "\n",
    "# Predict on the test set\n",
-    "y_pred = pipeline.predict(X_test)\n",
+    "y_pred = pipeline.predict(X_test)"
-    "\n",
+   ],
+   "outputs": [],
+   "execution_count": 12
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:33.807942Z",
+     "start_time": "2025-01-13T15:39:33.803068Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Measure accuracy\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
-    "print(\"Accuracy:\", accuracy)\n",
+    "print(\"Accuracy:\", accuracy)"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.4883720930232558\n"
+     ]
+    }
+   ],
+   "execution_count": 13
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:34.937492Z",
+     "start_time": "2025-01-13T15:39:33.826837Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Cross-validation\n",
    "cv_scores = cross_val_score(pipeline, X, y, cv=5)\n",
    "print(\"Cross-validation scores:\", cv_scores)\n",
-    "print(\"Mean CV accuracy:\", cv_scores.mean())\n",
+    "print(\"Mean CV accuracy:\", cv_scores.mean())"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]\n",
+      "Mean CV accuracy: 0.49302325581395345\n"
+     ]
+    }
+   ],
+   "execution_count": 14
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:35.887643Z",
+     "start_time": "2025-01-13T15:39:34.971743Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "# Cross-validation\n",
+    "cv_scores = cross_val_score(pipeline, X, y, cv=5)\n",
+    "print(\"Cross-validation scores:\", cv_scores)\n",
+    "print(\"Mean CV accuracy:\", cv_scores.mean())"
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]\n",
+      "Mean CV accuracy: 0.49302325581395345\n"
+     ]
+    }
+   ],
+   "execution_count": 15
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:35.906683Z",
+     "start_time": "2025-01-13T15:39:35.902653Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Optimization: Modify parameters and retry\n",
    "optimized_pipeline = Pipeline([\n",
    "    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),\n",
    "    ('classifier', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42))\n",
-    "])\n",
+    "])"
-    "\n",
+   ],
+   "outputs": [],
+   "execution_count": 16
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:36.161854Z",
+     "start_time": "2025-01-13T15:39:35.931782Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Train optimized pipeline\n",
-    "optimized_pipeline.fit(X_train, y_train)\n",
+    "optimized_pipeline.fit(X_train, y_train)"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Pipeline(steps=[('tfidf',\n",
+       "                 TfidfVectorizer(max_features=5000, stop_words='english')),\n",
+       "                ('classifier',\n",
+       "                 RandomForestClassifier(max_depth=20, n_estimators=200,\n",
+       "                                        random_state=42))])"
+      ],
+      "text/html": [
+       "<style>#sk-container-id-1 {\n",
+       "  /* Definition of color scheme common for light and dark mode */\n",
+       "  --sklearn-color-text: black;\n",
+       "  --sklearn-color-line: gray;\n",
+       "  /* Definition of color scheme for unfitted estimators */\n",
+       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
+       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
+       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
+       "  --sklearn-color-unfitted-level-3: chocolate;\n",
+       "  /* Definition of color scheme for fitted estimators */\n",
+       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
+       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
+       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
+       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
+       "\n",
+       "  /* Specific color for light theme */\n",
+       "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
+       "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
+       "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
+       "  --sklearn-color-icon: #696969;\n",
+       "\n",
+       "  @media (prefers-color-scheme: dark) {\n",
+       "    /* Redefinition of color scheme for dark theme */\n",
+       "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
+       "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
+       "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
+       "    --sklearn-color-icon: #878787;\n",
+       "  }\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 pre {\n",
+       "  padding: 0;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-hidden--visually {\n",
+       "  border: 0;\n",
+       "  clip: rect(1px 1px 1px 1px);\n",
+       "  clip: rect(1px, 1px, 1px, 1px);\n",
+       "  height: 1px;\n",
+       "  margin: -1px;\n",
+       "  overflow: hidden;\n",
+       "  padding: 0;\n",
+       "  position: absolute;\n",
+       "  width: 1px;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-dashed-wrapped {\n",
+       "  border: 1px dashed var(--sklearn-color-line);\n",
+       "  margin: 0 0.4em 0.5em 0.4em;\n",
+       "  box-sizing: border-box;\n",
+       "  padding-bottom: 0.4em;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-container {\n",
+       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
+       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
+       "     so we also need the `!important` here to be able to override the\n",
+       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
+       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
+       "  display: inline-block !important;\n",
+       "  position: relative;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-text-repr-fallback {\n",
+       "  display: none;\n",
+       "}\n",
+       "\n",
+       "div.sk-parallel-item,\n",
+       "div.sk-serial,\n",
+       "div.sk-item {\n",
+       "  /* draw centered vertical line to link estimators */\n",
+       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
+       "  background-size: 2px 100%;\n",
+       "  background-repeat: no-repeat;\n",
+       "  background-position: center center;\n",
+       "}\n",
+       "\n",
+       "/* Parallel-specific style estimator block */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item::after {\n",
+       "  content: \"\";\n",
+       "  width: 100%;\n",
+       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
+       "  flex-grow: 1;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel {\n",
+       "  display: flex;\n",
+       "  align-items: stretch;\n",
+       "  justify-content: center;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  position: relative;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item {\n",
+       "  display: flex;\n",
+       "  flex-direction: column;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
+       "  align-self: flex-end;\n",
+       "  width: 50%;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
+       "  align-self: flex-start;\n",
+       "  width: 50%;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
+       "  width: 0;\n",
+       "}\n",
+       "\n",
+       "/* Serial-specific style estimator block */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-serial {\n",
+       "  display: flex;\n",
+       "  flex-direction: column;\n",
+       "  align-items: center;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  padding-right: 1em;\n",
+       "  padding-left: 1em;\n",
+       "}\n",
+       "\n",
+       "\n",
+       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
+       "clickable and can be expanded/collapsed.\n",
+       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
+       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
+       "*/\n",
+       "\n",
+       "/* Pipeline and ColumnTransformer style (default) */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable {\n",
+       "  /* Default theme specific background. It is overwritten whether we have a\n",
+       "  specific estimator or a Pipeline/ColumnTransformer */\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "}\n",
+       "\n",
+       "/* Toggleable label */\n",
+       "#sk-container-id-1 label.sk-toggleable__label {\n",
+       "  cursor: pointer;\n",
+       "  display: block;\n",
+       "  width: 100%;\n",
+       "  margin-bottom: 0;\n",
+       "  padding: 0.5em;\n",
+       "  box-sizing: border-box;\n",
+       "  text-align: center;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
+       "  /* Arrow on the left of the label */\n",
+       "  content: \"▸\";\n",
+       "  float: left;\n",
+       "  margin-right: 0.25em;\n",
+       "  color: var(--sklearn-color-icon);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "}\n",
+       "\n",
+       "/* Toggleable content - dropdown */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content {\n",
+       "  max-height: 0;\n",
+       "  max-width: 0;\n",
+       "  overflow: hidden;\n",
+       "  text-align: left;\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content pre {\n",
+       "  margin: 0.2em;\n",
+       "  border-radius: 0.25em;\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
+       "  /* Expand drop-down */\n",
+       "  max-height: 200px;\n",
+       "  max-width: 100%;\n",
+       "  overflow: auto;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
+       "  content: \"▾\";\n",
+       "}\n",
+       "\n",
+       "/* Pipeline/ColumnTransformer-specific style */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Estimator-specific style */\n",
+       "\n",
+       "/* Colorize estimator box */\n",
+       "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
+       "#sk-container-id-1 div.sk-label label {\n",
+       "  /* The background is the default theme color */\n",
+       "  color: var(--sklearn-color-text-on-default-background);\n",
+       "}\n",
+       "\n",
+       "/* On hover, darken the color of the background */\n",
+       "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Label box, darken color on hover, fitted */\n",
+       "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Estimator label */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label label {\n",
+       "  font-family: monospace;\n",
+       "  font-weight: bold;\n",
+       "  display: inline-block;\n",
+       "  line-height: 1.2em;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label-container {\n",
+       "  text-align: center;\n",
+       "}\n",
+       "\n",
+       "/* Estimator-specific */\n",
+       "#sk-container-id-1 div.sk-estimator {\n",
+       "  font-family: monospace;\n",
+       "  border: 1px dotted var(--sklearn-color-border-box);\n",
+       "  border-radius: 0.25em;\n",
+       "  box-sizing: border-box;\n",
+       "  margin-bottom: 0.5em;\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "/* on hover */\n",
+       "#sk-container-id-1 div.sk-estimator:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
+       "\n",
+       "/* Common style for \"i\" and \"?\" */\n",
+       "\n",
+       ".sk-estimator-doc-link,\n",
+       "a:link.sk-estimator-doc-link,\n",
+       "a:visited.sk-estimator-doc-link {\n",
+       "  float: right;\n",
+       "  font-size: smaller;\n",
+       "  line-height: 1em;\n",
+       "  font-family: monospace;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  border-radius: 1em;\n",
+       "  height: 1em;\n",
+       "  width: 1em;\n",
+       "  text-decoration: none !important;\n",
+       "  margin-left: 1ex;\n",
+       "  /* unfitted */\n",
+       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-unfitted-level-1);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link.fitted,\n",
+       "a:link.sk-estimator-doc-link.fitted,\n",
+       "a:visited.sk-estimator-doc-link.fitted {\n",
+       "  /* fitted */\n",
+       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-fitted-level-1);\n",
+       "}\n",
+       "\n",
+       "/* On hover */\n",
+       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
+       ".sk-estimator-doc-link:hover,\n",
+       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
+       ".sk-estimator-doc-link:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
+       ".sk-estimator-doc-link.fitted:hover,\n",
+       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
+       ".sk-estimator-doc-link.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "/* Span, style for the box shown on hovering the info icon */\n",
+       ".sk-estimator-doc-link span {\n",
+       "  display: none;\n",
+       "  z-index: 9999;\n",
+       "  position: relative;\n",
+       "  font-weight: normal;\n",
+       "  right: .2ex;\n",
+       "  padding: .5ex;\n",
+       "  margin: .5ex;\n",
+       "  width: min-content;\n",
+       "  min-width: 20ex;\n",
+       "  max-width: 50ex;\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  box-shadow: 2pt 2pt 4pt #999;\n",
+       "  /* unfitted */\n",
+       "  background: var(--sklearn-color-unfitted-level-0);\n",
+       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link.fitted span {\n",
+       "  /* fitted */\n",
+       "  background: var(--sklearn-color-fitted-level-0);\n",
+       "  border: var(--sklearn-color-fitted-level-3);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link:hover span {\n",
+       "  display: block;\n",
+       "}\n",
+       "\n",
+       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link {\n",
+       "  float: right;\n",
+       "  font-size: 1rem;\n",
+       "  line-height: 1em;\n",
+       "  font-family: monospace;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  border-radius: 1rem;\n",
+       "  height: 1rem;\n",
+       "  width: 1rem;\n",
+       "  text-decoration: none;\n",
+       "  /* unfitted */\n",
+       "  color: var(--sklearn-color-unfitted-level-1);\n",
+       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
+       "  /* fitted */\n",
+       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-fitted-level-1);\n",
+       "}\n",
+       "\n",
+       "/* On hover */\n",
+       "#sk-container-id-1 a.estimator_doc_link:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-3);\n",
+       "}\n",
+       "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;tfidf&#x27;,\n",
+       "                 TfidfVectorizer(max_features=5000, stop_words=&#x27;english&#x27;)),\n",
+       "                (&#x27;classifier&#x27;,\n",
+       "                 RandomForestClassifier(max_depth=20, n_estimators=200,\n",
+       "                                        random_state=42))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;tfidf&#x27;,\n",
+       "                 TfidfVectorizer(max_features=5000, stop_words=&#x27;english&#x27;)),\n",
+       "                (&#x27;classifier&#x27;,\n",
+       "                 RandomForestClassifier(max_depth=20, n_estimators=200,\n",
+       "                                        random_state=42))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;TfidfVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\">?<span>Documentation for TfidfVectorizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>TfidfVectorizer(max_features=5000, stop_words=&#x27;english&#x27;)</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(max_depth=20, n_estimators=200, random_state=42)</pre></div> </div></div></div></div></div></div>"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 17
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:36.240197Z",
+     "start_time": "2025-01-13T15:39:36.228297Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Measure optimized accuracy\n",
    "optimized_y_pred = optimized_pipeline.predict(X_test)\n",
    "optimized_accuracy = accuracy_score(y_test, optimized_y_pred)\n",
-    "print(\"Optimized Accuracy:\", optimized_accuracy)\n",
+    "print(\"Optimized Accuracy:\", optimized_accuracy)"
-    "\n",
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimized Accuracy: 0.4883720930232558\n"
+     ]
+    }
+   ],
+   "execution_count": 18
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:37.512194Z",
+     "start_time": "2025-01-13T15:39:36.251209Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
    "# Optimized cross-validation\n",
    "optimized_cv_scores = cross_val_score(optimized_pipeline, X, y, cv=5)\n",
    "print(\"Optimized Cross-validation scores:\", optimized_cv_scores)\n",
-    "print(\"Mean Optimized CV accuracy:\", optimized_cv_scores.mean())\n",
+    "print(\"Mean Optimized CV accuracy:\", optimized_cv_scores.mean())"
-    "\n"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Data loaded and cleaned successfully.\n",
-      "Columns with missing values: []\n",
-      "Accuracy: 0.4883720930232558\n",
-      "Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]\n",
-      "Mean CV accuracy: 0.49302325581395345\n",
-      "Optimized Accuracy: 0.4883720930232558\n",
      "Optimized Cross-validation scores: [0.51162791 0.51162791 0.53488372 0.51162791 0.44186047]\n",
      "Mean Optimized CV accuracy: 0.5023255813953489\n"
     ]
    }
   ],
-   "execution_count": 6
+   "execution_count": 19
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-13T15:39:37.532659Z",
+     "start_time": "2025-01-13T15:39:37.529682Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "",
+   "outputs": [],
+   "execution_count": null
  }
 ],
 "metadata": {

 %% Cell type:code id: tags:
 ``` python
 import pandas as pd
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.metrics import accuracy_score
 from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
+```
+%% Cell type:code id: tags:
-# Load data
+``` python
 data_path = 'data/Movie_Overview_Classification.csv'
 # Load and clean the movie-overview dataset.
 # NOTE(review): the data.head() output in the following cell shows overview
 # fragments spread across the index and the 'id' column, which suggests that
 # commas inside the pipe-wrapped overview text are being parsed as separators.
 # Consider reading with quotechar='|' -- verify against the raw CSV first.
 try:
    # Comma-separated read; lines with too many fields are skipped instead of raising
    data = pd.read_csv(data_path, sep=',', on_bad_lines='skip')
    # Remove leading and trailing pipes if they exist in the 'overview' column
    data['overview'] = data['overview'].str.strip('|')
    # Drop rows that lack either the text ('overview') or the label ('genre_Drama')
    data = data.dropna(subset=['overview', 'genre_Drama'])
    print("Data loaded and cleaned successfully.")
 except Exception as e:
    print("Error during cleaning:", str(e))
    # Re-raise: every later cell needs `data`, so swallowing the error here
    # would only defer the failure to a less informative NameError.
    raise
+```
+%% Output
+    Data loaded and cleaned successfully.
+%% Cell type:code id: tags:
+``` python
 # Preview the first five rows to sanity-check the parsed columns
 data.head(n=5)
+```
+%% Output
+                                                                                                                                                                                                                                                                   id  \
+    1  |When Lou                                          who has become the "father of the Internet         is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr...   who inspired by the death of their former man...
+    17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner          who is also in the waiting list. They sit toget...   and Jack reveals that he wants Lisa to change...
+    34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder                leaner                                             faster paced and even more entertaining antholo...                   with a new crop of award-winning
+    74 |Two students from the Czech Film Academy commi... posters                                            flyers with photos of fake Czech Dream products    a promotional song                                                                   an internet site
+    99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day      however                                            he is mistaken for a real Professor and takes h...       Sara is engaged to real-life hero Max Mordon
+                                                                                                                                                                                                                                                             overview  \
+    1  |When Lou                                          who has become the "father of the Internet         is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr...   get back on the stage for one concert in New ...
+    17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner          who is also in the waiting list. They sit toget...   Lisa's father will be killed by a hit man. Li...
+    34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder                leaner                                             faster paced and even more entertaining antholo...        visionary filmmakers from around the globe.
+    74 |Two students from the Czech Film Academy commi... posters                                            flyers with photos of fake Czech Dream products    a promotional song                                   and ads in newspapers and magazines. Will peo...
+    99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day      however                                            he is mistaken for a real Professor and takes h...   but Max has secretly betrayed the Professor b...
+                                                                                                                                                                                                                    genre_Drama
+    1  |When Lou                                          who has become the "father of the Internet         is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr...          0.0
+    17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner          who is also in the waiting list. They sit toget...          0.0
+    34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder                leaner                                             faster paced and even more entertaining antholo...          0.0
+    74 |Two students from the Czech Film Academy commi... posters                                            flyers with photos of fake Czech Dream products    a promotional song                                          0.0
+    99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day      however                                            he is mistaken for a real Professor and takes h...          0.0
+%% Cell type:code id: tags:
+``` python
 # Pre-processing
 # List every column that still holds at least one NaN
 missing_columns = data.columns[data.isna().any()].tolist()
 print("Columns with missing values:", missing_columns)
 # Impute only when needed: fill gaps with each column's most frequent value
 if missing_columns:
    imputer = SimpleImputer(strategy='most_frequent')
    data[missing_columns] = imputer.fit_transform(data[missing_columns])
+```
+%% Output
+    Columns with missing values: []
+%% Cell type:code id: tags:
+``` python
 # Define features and target
 # X: 'overview' column (assumed to hold the text); y: 'genre_Drama' column (assumed to hold the labels)
 X, y = data['overview'], data['genre_Drama']
+```
+%% Cell type:code id: tags:
+``` python
 # Baseline model: TF-IDF text features feeding a random forest classifier
 pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
 ])
+```
+%% Cell type:code id: tags:
+``` python
 # Hold out 20% of the rows for testing (fixed seed keeps the split reproducible)
 X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
 )
 # Fit on the training split, then predict the held-out split
 y_pred = pipeline.fit(X_train, y_train).predict(X_test)
+```
+%% Cell type:code id: tags:
+``` python
 # Share of test rows whose predicted label matches the true label
 accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
 print("Accuracy:", accuracy)
+```
+%% Output
+    Accuracy: 0.4883720930232558
+%% Cell type:code id: tags:
+``` python
 # 5-fold cross-validation of the baseline pipeline over the full X / y
 cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
 print("Cross-validation scores:", cv_scores)
 print("Mean CV accuracy:", cv_scores.mean())
+```
+%% Output
+    Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]
+    Mean CV accuracy: 0.49302325581395345
+%% Cell type:code id: tags:
+``` python
+# Cross-validation
+# NOTE(review): this cell repeats the preceding cross-validation cell verbatim
+# (same code, same printed scores); it looks like an accidental duplicate and
+# can probably be removed.
+cv_scores = cross_val_score(pipeline, X, y, cv=5)
+print("Cross-validation scores:", cv_scores)
+print("Mean CV accuracy:", cv_scores.mean())
+```
+%% Output
+    Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]
+    Mean CV accuracy: 0.49302325581395345
+%% Cell type:code id: tags:
+``` python
 # Optimization: cap the TF-IDF vocabulary at 5000 terms and use 200 trees limited to depth 20
 optimized_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('classifier', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)),
 ])
+```
+%% Cell type:code id: tags:
+``` python
 # Fit the optimized pipeline on the training split; the fitted pipeline is the cell's displayed value
 optimized_pipeline.fit(X=X_train, y=y_train)
+```
+%% Output
+    Pipeline(steps=[('tfidf',
+                     TfidfVectorizer(max_features=5000, stop_words='english')),
+                    ('classifier',
+                     RandomForestClassifier(max_depth=20, n_estimators=200,
+                                            random_state=42))])
+%% Cell type:code id: tags:
+``` python
 # Score the optimized pipeline's predictions for X_test against y_test
 optimized_y_pred = optimized_pipeline.predict(X_test)
 optimized_accuracy = accuracy_score(y_true=y_test, y_pred=optimized_y_pred)
 print("Optimized Accuracy:", optimized_accuracy)
+```
+%% Output
+    Optimized Accuracy: 0.4883720930232558
+%% Cell type:code id: tags:
+``` python
 # 5-fold cross-validation of the optimized pipeline over the full X / y
 optimized_cv_scores = cross_val_score(estimator=optimized_pipeline, X=X, y=y, cv=5)
 print("Optimized Cross-validation scores:", optimized_cv_scores)
 print("Mean Optimized CV accuracy:", optimized_cv_scores.mean())
 ```
 %% Output
-    Data loaded and cleaned successfully.
-    Columns with missing values: []
-    Accuracy: 0.4883720930232558
-    Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]
-    Mean CV accuracy: 0.49302325581395345
-    Optimized Accuracy: 0.4883720930232558
    Optimized Cross-validation scores: [0.51162791 0.51162791 0.53488372 0.51162791 0.44186047]
    Mean Optimized CV accuracy: 0.5023255813953489
+%% Cell type:code id: tags:
+``` python
+```