Skip to content
Snippets Groups Projects
Commit abb6fb21 authored by Denis Cvach
Browse files

fixed missing imports

parent 7f625448
Branches
No related tags found
No related merge requests found
%% Cell type:code id:16b1a4e87c7a0fc tags:
``` python
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
```
%% Cell type:code id:initial_id tags:
``` python
# Load the training and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
```
%% Cell type:code id:106ef69516455c6a tags:
``` python
# Preview the first five rows of the raw training data.
train.head()
```
%% Output
id belongs_to_collection budget \
0 1 [{'id': 313576, 'name': 'Hot Tub Time Machine ... 14000000
1 2 [{'id': 107674, 'name': 'The Princess Diaries ... 40000000
2 3 NaN 3300000
3 4 NaN 1200000
4 5 NaN 0
genres \
0 [{'id': 35, 'name': 'Comedy'}]
1 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
2 [{'id': 18, 'name': 'Drama'}]
3 [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...
4 [{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...
homepage imdb_id original_language \
0 NaN tt2637294 en
1 NaN tt0368933 en
2 http://sonyclassics.com/whiplash/ tt2582802 en
3 http://kahaanithefilm.com/ tt1821480 hi
4 NaN tt1380152 ko
original_title \
0 Hot Tub Time Machine 2
1 The Princess Diaries 2: Royal Engagement
2 Whiplash
3 Kahaani
4 마린보이
overview popularity ... \
0 When Lou, who has become the "father of the In... 6.575393 ...
1 Mia Thermopolis is now a college graduate and ... 8.248895 ...
2 Under the direction of a ruthless instructor, ... 64.299990 ...
3 Vidya Bagchi (Vidya Balan) arrives in Kolkata ... 3.174936 ...
4 Marine Boy is the story of a former national s... 1.148070 ...
release_date runtime spoken_languages \
0 2/20/15 93.0 [{'iso_639_1': 'en', 'name': 'English'}]
1 8/6/04 113.0 [{'iso_639_1': 'en', 'name': 'English'}]
2 10/10/14 105.0 [{'iso_639_1': 'en', 'name': 'English'}]
3 3/9/12 122.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
4 2/5/09 118.0 [{'iso_639_1': 'ko', 'name': '한국어/조선말'}]
status tagline \
0 Released The Laws of Space and Time are About to be Vio...
1 Released It can take a lifetime to find true love; she'...
2 Released The road to greatness can take you to the edge.
3 Released NaN
4 Released NaN
title \
0 Hot Tub Time Machine 2
1 The Princess Diaries 2: Royal Engagement
2 Whiplash
3 Kahaani
4 Marine Boy
Keywords \
0 [{'id': 4379, 'name': 'time travel'}, {'id': 9...
1 [{'id': 2505, 'name': 'coronation'}, {'id': 42...
2 [{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...
3 [{'id': 10092, 'name': 'mystery'}, {'id': 1054...
4 NaN
cast \
0 [{'cast_id': 4, 'character': 'Lou', 'credit_id...
1 [{'cast_id': 1, 'character': 'Mia Thermopolis'...
2 [{'cast_id': 5, 'character': 'Andrew Neimann',...
3 [{'cast_id': 1, 'character': 'Vidya Bagchi', '...
4 [{'cast_id': 3, 'character': 'Chun-soo', 'cred...
crew revenue
0 [{'credit_id': '59ac067c92514107af02c8c8', 'de... 12314651
1 [{'credit_id': '52fe43fe9251416c7502563d', 'de... 95149435
2 [{'credit_id': '54d5356ec3a3683ba0000039', 'de... 13092000
3 [{'credit_id': '52fe48779251416c9108d6eb', 'de... 16000000
4 [{'credit_id': '52fe464b9251416c75073b43', 'de... 3923970
[5 rows x 23 columns]
%% Cell type:code id:e3b97dd2729e980e tags:
``` python
# Show the (rows, columns) dimensions of the training frame.
train.shape
```
%% Output
(3000, 23)
%% Cell type:code id:4c3a3221c61368eb tags:
``` python
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train.describe()
```
%% Output
id budget popularity runtime revenue
count 3000.000000 3.000000e+03 3000.000000 2998.000000 3.000000e+03
mean 1500.500000 2.253133e+07 8.463274 107.856571 6.672585e+07
std 866.169729 3.702609e+07 12.104000 22.086434 1.375323e+08
min 1.000000 0.000000e+00 0.000001 0.000000 1.000000e+00
25% 750.750000 0.000000e+00 4.018053 94.000000 2.379808e+06
50% 1500.500000 8.000000e+06 7.374861 104.000000 1.680707e+07
75% 2250.250000 2.900000e+07 10.890983 118.000000 6.891920e+07
max 3000.000000 3.800000e+08 294.337037 338.000000 1.519558e+09
%% Cell type:code id:af54d94c tags:
``` python
# Drop unused columns
unused_cols = ['homepage', 'imdb_id', 'original_title', 'poster_path']
train.drop(columns=unused_cols, inplace=True)
test.drop(columns=unused_cols, inplace=True)
```
%% Cell type:code id:ad062ac5 tags:
``` python
# Cast budget and revenue to float
train['budget'] = train['budget'].astype(float)
train['revenue'] = train['revenue'].astype(float)
test['budget'] = test['budget'].astype(float)
```
%% Cell type:code id:512ae877fee04d0e tags:
``` python
# Count missing values per column in the training data.
train.isna().sum()
```
%% Output
id 0
belongs_to_collection 2396
budget 0
genres 7
original_language 0
overview 8
popularity 0
production_companies 156
production_countries 55
release_date 0
runtime 2
spoken_languages 20
status 0
tagline 597
title 0
Keywords 276
cast 13
crew 16
revenue 0
dtype: int64
%% Cell type:code id:4be3c0de600ed39c tags:
``` python
# Handle belongs_to_collection
for df in [train, test]:
df['belongs_to_collection'] = df['belongs_to_collection'].fillna('[]')
df['belongs_to_collection'] = df['belongs_to_collection'].apply(
lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
df['belongs_to_collection'] = df['belongs_to_collection'].apply(
lambda x: 1 if isinstance(x, list) and len(x) > 0 else 0
)
```
%% Cell type:code id:8498fca1 tags:
``` python
# Inspect column dtypes and non-null counts of the training frame.
train.info()
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 3000 non-null int64
1 belongs_to_collection 3000 non-null int64
2 budget 3000 non-null float64
3 genres 2993 non-null object
4 original_language 3000 non-null object
5 overview 2992 non-null object
6 popularity 3000 non-null float64
7 production_companies 2844 non-null object
8 production_countries 2945 non-null object
9 release_date 3000 non-null object
10 runtime 2998 non-null float64
11 spoken_languages 2980 non-null object
12 status 3000 non-null object
13 tagline 2403 non-null object
14 title 3000 non-null object
15 Keywords 2724 non-null object
16 cast 2987 non-null object
17 crew 2984 non-null object
18 revenue 3000 non-null float64
dtypes: float64(4), int64(2), object(13)
memory usage: 445.4+ KB
%% Cell type:code id:e93f5299 tags:
``` python
# Preview the first five rows of the training frame at this point.
train.head()
```
%% Output
id belongs_to_collection budget \
0 1 1 14000000.0
1 2 1 40000000.0
2 3 0 3300000.0
3 4 0 1200000.0
4 5 0 0.0
genres original_language \
0 [{'id': 35, 'name': 'Comedy'}] en
1 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... en
2 [{'id': 18, 'name': 'Drama'}] en
3 [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n... hi
4 [{'id': 28, 'name': 'Action'}, {'id': 53, 'nam... ko
overview popularity \
0 When Lou, who has become the "father of the In... 6.575393
1 Mia Thermopolis is now a college graduate and ... 8.248895
2 Under the direction of a ruthless instructor, ... 64.299990
3 Vidya Bagchi (Vidya Balan) arrives in Kolkata ... 3.174936
4 Marine Boy is the story of a former national s... 1.148070
production_companies \
0 [{'name': 'Paramount Pictures', 'id': 4}, {'na...
1 [{'name': 'Walt Disney Pictures', 'id': 2}]
2 [{'name': 'Bold Films', 'id': 2266}, {'name': ...
3 NaN
4 NaN
production_countries release_date runtime \
0 [{'iso_3166_1': 'US', 'name': 'United States o... 2/20/15 93.0
1 [{'iso_3166_1': 'US', 'name': 'United States o... 8/6/04 113.0
2 [{'iso_3166_1': 'US', 'name': 'United States o... 10/10/14 105.0
3 [{'iso_3166_1': 'IN', 'name': 'India'}] 3/9/12 122.0
4 [{'iso_3166_1': 'KR', 'name': 'South Korea'}] 2/5/09 118.0
spoken_languages status \
0 [{'iso_639_1': 'en', 'name': 'English'}] Released
1 [{'iso_639_1': 'en', 'name': 'English'}] Released
2 [{'iso_639_1': 'en', 'name': 'English'}] Released
3 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released
4 [{'iso_639_1': 'ko', 'name': '한국어/조선말'}] Released
tagline \
0 The Laws of Space and Time are About to be Vio...
1 It can take a lifetime to find true love; she'...
2 The road to greatness can take you to the edge.
3 NaN
4 NaN
title \
0 Hot Tub Time Machine 2
1 The Princess Diaries 2: Royal Engagement
2 Whiplash
3 Kahaani
4 Marine Boy
Keywords \
0 [{'id': 4379, 'name': 'time travel'}, {'id': 9...
1 [{'id': 2505, 'name': 'coronation'}, {'id': 42...
2 [{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...
3 [{'id': 10092, 'name': 'mystery'}, {'id': 1054...
4 NaN
cast \
0 [{'cast_id': 4, 'character': 'Lou', 'credit_id...
1 [{'cast_id': 1, 'character': 'Mia Thermopolis'...
2 [{'cast_id': 5, 'character': 'Andrew Neimann',...
3 [{'cast_id': 1, 'character': 'Vidya Bagchi', '...
4 [{'cast_id': 3, 'character': 'Chun-soo', 'cred...
crew revenue
0 [{'credit_id': '59ac067c92514107af02c8c8', 'de... 12314651.0
1 [{'credit_id': '52fe43fe9251416c7502563d', 'de... 95149435.0
2 [{'credit_id': '54d5356ec3a3683ba0000039', 'de... 13092000.0
3 [{'credit_id': '52fe48779251416c9108d6eb', 'de... 16000000.0
4 [{'credit_id': '52fe464b9251416c75073b43', 'de... 3923970.0
%% Cell type:code id:06434f30 tags:
``` python
# Step 1: Replace NaN with the median runtime
median_runtime = train['runtime'].median()
train['runtime'] = train['runtime'].fillna(median_runtime)
# Step 2: Define bins and labels for runtime categories
bins = [0, 60, 90, 120, 150, 180, 210, np.inf]
labels = [1, 2, 3, 4, 5, 6, 7]
# Step 3: Categorize runtime into bins
train['runtime'] = pd.cut(train['runtime'], bins=bins, labels=labels, right=False)
# Analyze counts
runtime_counts = train['runtime'].value_counts().sort_index()
# Display results
print(runtime_counts)
# Step 4: Plot revenue for different runtime categories
plt.figure(figsize=(8, 6))
sns.boxplot(x='runtime', y='revenue', data=train)
plt.title('Revenue Distribution for Different Runtime Categories')
plt.xlabel('Runtime Category')
plt.ylabel('Revenue')
plt.show()
```
%% Output
runtime
1 16
2 397
3 1882
4 570
5 106
6 22
7 7
Name: count, dtype: int64
%% Cell type:code id:38ef1154 tags:
``` python
# Inspect column dtypes and non-null counts of the training frame again.
train.info()
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 3000 non-null int64
1 belongs_to_collection 3000 non-null int64
2 budget 3000 non-null float64
3 genres 2993 non-null object
4 original_language 3000 non-null object
5 overview 2992 non-null object
6 popularity 3000 non-null float64
7 production_companies 2844 non-null object
8 production_countries 2945 non-null object
9 release_date 3000 non-null object
10 runtime 3000 non-null category
11 spoken_languages 2980 non-null object
12 status 3000 non-null object
13 tagline 2403 non-null object
14 title 3000 non-null object
15 Keywords 2724 non-null object
16 cast 2987 non-null object
17 crew 2984 non-null object
18 revenue 3000 non-null float64
dtypes: category(1), float64(3), int64(2), object(13)
memory usage: 425.3+ KB
%% Cell type:code id:df7d1aae tags:
``` python
# Analyze counts of movies by original language
language_counts = train['original_language'].value_counts()
# Calculate the average revenue by original language
average_revenue_by_language = train.groupby('original_language')['revenue'].mean()
# Plot the average revenue by original language
plt.figure(figsize=(12, 8))
sns.barplot(x=average_revenue_by_language.index, y=average_revenue_by_language.values)
plt.title('Average Revenue by Original Language')
plt.xlabel('Original Language')
plt.ylabel('Average Revenue')
plt.xticks(rotation=90)
plt.show()
```
%% Output
%% Cell type:code id:56432d39 tags:
``` python
# Handle original_language
all_languages = pd.concat([train['original_language'], test['original_language']]).unique()
encoder = LabelEncoder()
encoder.fit(all_languages)
train['original_language'] = encoder.transform(train['original_language'])
test['original_language'] = encoder.transform(test['original_language'])
```
%% Cell type:code id:c99cf1ee tags:
``` python
# Handle release_date and extract Year
for df in [train, test]:
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d', errors='coerce')
df['Year'] = df['release_date'].dt.year
df.drop(columns=['release_date'], inplace=True)
if 'Year' in df.columns:
df['Year'] = df['Year'].fillna(0)
```
%% Output
C:\Users\Denis\AppData\Local\Temp\ipykernel_23124\3171800086.py:4: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
C:\Users\Denis\AppData\Local\Temp\ipykernel_23124\3171800086.py:4: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
%% Cell type:code id:b15ee976 tags:
``` python
# Increase weight of 'budget' and 'Year' before defining X
train['budget'] = train['budget'] * 10
train['Year'] = train['Year'] * 100
test['budget'] = test['budget'] * 10
test['Year'] = test['Year'] * 100
# Define features and target
features = ['budget', 'Year', 'runtime', 'belongs_to_collection', 'original_language']
X = train[features]
y = train['revenue']
# Prepare test data
X_test = test[features]
```
%% Cell type:code id:fd7a8143 tags:
``` python
# Split train data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)
# Evaluate model
y_valid_pred = model.predict(X_valid)
mse = mean_squared_error(y_valid, y_valid_pred)
print(f"Validation Mean Squared Error: {mse}")
# Predict on test data
test_predictions = model.predict(X_test)
# Create submission file
submission = pd.DataFrame({'id': test['id'], 'revenue': test_predictions})
submission.to_csv('submission.csv', index=False)
print("Submission file created as 'submission.csv'.")
```
%% Output
Validation Mean Squared Error: 9433624811236546.0
Validation Mean Squared Error: 7865794974968504.0
Submission file created as 'submission.csv'.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment