Skip to content
Snippets Groups Projects
Commit 72248df0 authored by Denis Cvach's avatar Denis Cvach
Browse files

slight adjustments

parent 260ed044
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pandas as pd import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer from sklearn.compose import ColumnTransformer
```
%% Cell type:code id: tags:
# Load data ``` python
data_path = 'data/Movie_Overview_Classification.csv'

# Load and clean the dataset.
# NOTE(review): the `data.head()` output in this notebook shows fragments of
# the overview text landing in the index and in the `id` column, i.e. commas
# inside the pipe-wrapped overview were being treated as field separators.
# Parsing with `|` as the quote character keeps each overview in one field --
# confirm against the raw CSV.
try:
    data = pd.read_csv(data_path, sep=',', quotechar='|', on_bad_lines='skip')
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
    # Fail loudly: every later cell needs `data`, so carrying on after a
    # failed load would only resurface as a confusing NameError.
    print("Error during cleaning:", str(e))
    raise
else:
    # Remove any leading and trailing pipes that survive in the 'overview' column
    data['overview'] = data['overview'].str.strip('|')
    # Drop rows with NaN values in the essential columns
    data = data.dropna(subset=['overview', 'genre_Drama'])
    print("Data loaded and cleaned successfully.")
```
%% Output
Data loaded and cleaned successfully.
%% Cell type:code id: tags:
``` python
# Preview the first five rows to sanity-check how the file was parsed
data.head(5)
```
%% Output
id \
1 |When Lou who has become the "father of the Internet is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr... who inspired by the death of their former man...
17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner who is also in the waiting list. They sit toget... and Jack reveals that he wants Lisa to change...
34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder leaner faster paced and even more entertaining antholo... with a new crop of award-winning
74 |Two students from the Czech Film Academy commi... posters flyers with photos of fake Czech Dream products a promotional song an internet site
99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day however he is mistaken for a real Professor and takes h... Sara is engaged to real-life hero Max Mordon
overview \
1 |When Lou who has become the "father of the Internet is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr... get back on the stage for one concert in New ...
17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner who is also in the waiting list. They sit toget... Lisa's father will be killed by a hit man. Li...
34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder leaner faster paced and even more entertaining antholo... visionary filmmakers from around the globe.
74 |Two students from the Czech Film Academy commi... posters flyers with photos of fake Czech Dream products a promotional song and ads in newspapers and magazines. Will peo...
99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day however he is mistaken for a real Professor and takes h... but Max has secretly betrayed the Professor b...
genre_Drama
1 |When Lou who has become the "father of the Internet is shot by an unknown assailant, Jacob and Nick... director Christopher Guest reunites the team fr... 0.0
17 |After attending the funeral of her grandmother... the Lux Atlantic Hotel manager Lisa is waiting ... she meets in the airport bar Jack Rippner who is also in the waiting list. They sit toget... 0.0
34 |Taking all that was great from the first insta... ABCs OF DEATH 2 aims to be a wilder leaner faster paced and even more entertaining antholo... 0.0
74 |Two students from the Czech Film Academy commi... posters flyers with photos of fake Czech Dream products a promotional song 0.0
99 |Tad is a celebrity archeologist and adventurer... Tad is a Chicago construction worker. One day however he is mistaken for a real Professor and takes h... 0.0
%% Cell type:code id: tags:
``` python
# Pre-processing: list the columns that still contain missing values
has_missing = data.isnull().any()
missing_columns = data.columns[has_missing].tolist()
print("Columns with missing values:", missing_columns)

# Fill any remaining gaps with each column's most frequent value
if len(missing_columns) > 0:
    imputer = SimpleImputer(strategy='most_frequent')
    data[missing_columns] = imputer.fit_transform(data[missing_columns])
```
%% Output
Columns with missing values: []
%% Cell type:code id: tags:
``` python
# Features and target (assumes 'overview' holds the text and
# 'genre_Drama' holds the labels)
X, y = data['overview'], data['genre_Drama']
```
%% Cell type:code id: tags:
``` python
# Baseline model: TF-IDF text features feeding a random forest classifier
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
classifier_step = (
    'classifier',
    RandomForestClassifier(n_estimators=100, random_state=42),
)
pipeline = Pipeline(steps=[tfidf_step, classifier_step])
```
%% Cell type:code id: tags:
``` python
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit on the training split, then predict the held-out split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
```
%% Cell type:code id: tags:
``` python
# Fraction of held-out rows the baseline pipeline predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
```
%% Output
Accuracy: 0.4883720930232558
%% Cell type:code id: tags:
``` python
# 5-fold cross-validation of the baseline pipeline over the full dataset
cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())
```
%% Output
Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]
Mean CV accuracy: 0.49302325581395345
%% Cell type:code id: tags:
``` python
# 5-fold cross-validation of the baseline pipeline
# NOTE(review): this cell appears to repeat the cross-validation cell just
# above it -- consider keeping only one of them.
cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())
```
%% Output
Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]
Mean CV accuracy: 0.49302325581395345
%% Cell type:code id: tags:
``` python
# Tuned variant: cap the TF-IDF vocabulary at 5000 terms and use 200 trees
# limited to depth 20
optimized_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    (
        'classifier',
        RandomForestClassifier(random_state=42, max_depth=20, n_estimators=200),
    ),
])
```
%% Cell type:code id: tags:
``` python
# Fit the tuned pipeline on the same training split as the baseline
optimized_pipeline.fit(X=X_train, y=y_train)
```
%% Output
Pipeline(steps=[('tfidf',
TfidfVectorizer(max_features=5000, stop_words='english')),
('classifier',
RandomForestClassifier(max_depth=20, n_estimators=200,
random_state=42))])
%% Cell type:code id: tags:
``` python
# Score the tuned pipeline on the held-out split
optimized_y_pred = optimized_pipeline.predict(X_test)
optimized_accuracy = accuracy_score(y_true=y_test, y_pred=optimized_y_pred)
print("Optimized Accuracy:", optimized_accuracy)
```
%% Output
Optimized Accuracy: 0.4883720930232558
%% Cell type:code id: tags:
``` python
# 5-fold cross-validation of the tuned pipeline over the full dataset
optimized_cv_scores = cross_val_score(estimator=optimized_pipeline, X=X, y=y, cv=5)
print("Optimized Cross-validation scores:", optimized_cv_scores)
print("Mean Optimized CV accuracy:", optimized_cv_scores.mean())
``` ```
%% Output %% Output
Data loaded and cleaned successfully.
Columns with missing values: []
Accuracy: 0.4883720930232558
Cross-validation scores: [0.53488372 0.51162791 0.48837209 0.48837209 0.44186047]
Mean CV accuracy: 0.49302325581395345
Optimized Accuracy: 0.4883720930232558
Optimized Cross-validation scores: [0.51162791 0.51162791 0.53488372 0.51162791 0.44186047] Optimized Cross-validation scores: [0.51162791 0.51162791 0.53488372 0.51162791 0.44186047]
Mean Optimized CV accuracy: 0.5023255813953489 Mean Optimized CV accuracy: 0.5023255813953489
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment