import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Line plot: sample data reused by the line and scatter plots further down.
x = np.arange(10)    # 0, 1, ..., 9
y1 = x * x           # quadratic series
y2 = 3 + 2 * x       # linear series
print(x, y1, y2, sep="\n")
[0 1 2 3 4 5 6 7 8 9] [ 0 1 4 9 16 25 36 49 64 81] [ 3 5 7 9 11 13 15 17 19 21]
# Names of the style sheets this matplotlib installation can apply through
# plt.style.use(); printed so one can be picked.
themes=plt.style.available
print(themes)
['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
# The bundled seaborn style sheets were renamed with a "seaborn-v0_8-" prefix
# in matplotlib 3.6 (the old names were removed later), so apply whichever
# spelling this installation offers; if neither exists the default style stays.
for style_name in ("seaborn-v0_8-dark-palette", "seaborn-dark-palette"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Two plot() calls followed by a single show() draw both series on one graph.
plt.plot(x, y1, color='red', label="Apple", marker="o")
# (Calling plt.show() here instead would put each series on its own graph.)
plt.plot(x, y2, color='green', label="Kiwi", linestyle="dashed")
plt.xlabel("Time")
plt.ylabel("Price")
plt.title("Prices of fruit over time ")
plt.legend()  # legend entries come from the label= of each plot() call
plt.show()

# plot() also works when no x-axis data is supplied: only the y values are
# passed here.
prices = np.array([1, 2, 3, 4]) ** 2
print(prices)
plt.plot(prices)
# Show the figure now so this line does not leak into the next plot when the
# file is run as a plain script instead of cell by cell.
plt.show()
[ 1 4 9 16]
[<matplotlib.lines.Line2D at 0x7f6cb4981690>]
# SCATTER PLOTS
# Simplest form: one marker per (x, y1) pair with default styling.
plt.scatter(x, y1)
plt.show()

# To change the figure size, call plt.figure(figsize=(5, 5)) before plotting.
plt.scatter(x, y1, color='red', label="Apple", marker="^")
# No linestyle here: scatter() draws unconnected markers, so the "dashed"
# setting copied from the line plot only restyled the marker outlines.
plt.scatter(x, y2, color='green', label="Kiwi")
plt.xlabel("Time")
plt.ylabel("Price")
plt.title("Prices of fruit over time ")
plt.legend()
plt.show()  # a single show() keeps both series on the same graph
# Both series use the same x positions, so the second set of bars is drawn
# on top of the first.
plt.bar([0, 1, 2], [10, 20, 15])  # current year
plt.bar([0, 1, 2], [20, 10, 12])  # next year
plt.show()

# Side-by-side version: spread the group centres out (0, 2, 4) and shift each
# series half a bar width to the left / right of its centre.
x_coor = np.array([0, 1, 2]) * 2
bar_width = 0.5
half_width = bar_width / 2
metals = ["gold", "silver", "platinum"]
plt.bar(
    x_coor - half_width,
    [10, 20, 15],
    width=bar_width,
    tick_label=metals,
    label="current year",
)
plt.bar(
    x_coor + half_width,
    [20, 10, 12],
    width=bar_width,
    label="next year",
)
plt.title("Metal Price comparison")
plt.xlabel("Metal")
plt.ylabel("Price")
# Fix the axis ranges explicitly (for scaling).
plt.xlim(-2, 5)
plt.ylim(0, 40)
plt.legend()
plt.show()
# Same grouped bar chart with an explicit colour for the second series and
# automatic axis limits.
# (Optionally switch theme first with plt.style.use("dark_background").)
x_coor = np.arange(0, 6, 2)  # group centres at 0, 2, 4
current_bars = dict(width=0.5, label="current year",
                    tick_label=["gold", "silver", "platinum"])
next_bars = dict(width=0.5, label="next year", color="orange")
plt.bar(x_coor - 0.25, [10, 20, 15], **current_bars)
plt.bar(x_coor + 0.25, [20, 10, 12], **next_bars)
plt.title("Metal Price comparison")
plt.ylabel("Price")
plt.xlabel("Metal")
plt.legend()
plt.show()
# Basic pie chart: one wedge per subject, sized by its weightage.
subjects = ("ME", "ITC", "DCCN", "DSP")
weightage = [10, 20, 15, 5]
plt.pie(weightage, labels=subjects)
# Show the figure here; without this, running the file as a script would draw
# the next pie chart over this one on the same axes.
plt.show()
([<matplotlib.patches.Wedge at 0x7f6cb4666b50>, <matplotlib.patches.Wedge at 0x7f6cb4666e10>, <matplotlib.patches.Wedge at 0x7f6cb45f24d0>, <matplotlib.patches.Wedge at 0x7f6cb45f2590>], [Text(0.8899186877588753, 0.6465637858537406, 'ME'), Text(-0.8899187180267095, 0.6465637441936395, 'ITC'), Text(1.0298943251329445e-07, -1.0999999999999954, 'DCCN'), Text(1.0461622140716127, -0.3399185517867209, 'DSP')])
# Pie chart with wedges pulled out of the centre and percentage labels.
subjects = ("ME", "ITC", "DCCN", "DSP")
weightage = [10, 20, 15, 5]
# One offset per wedge, in the same order as `subjects`; 0 leaves it in place.
wedge_offsets = (1, 0, 0.1, 0)
plt.pie(
    weightage,
    explode=wedge_offsets,
    labels=subjects,
    autopct='%1.1f%%',  # percentage text with one decimal place
)
plt.show()
import os

import pandas as pd

# Print the current working directory (the default location for relative file
# paths). `pwd` is an IPython magic and raises NameError in plain Python.
print(os.getcwd())
'/kaggle/working'
# Load the movie metadata CSV (path is relative to the working directory) and
# preview its first ten rows.
df = pd.read_csv("movie_metadata.csv")
print(df.head(10))
color director_name num_critic_for_reviews duration \
0 Color James Cameron 723.0 178.0
1 Color Gore Verbinski 302.0 169.0
2 Color Sam Mendes 602.0 148.0
3 Color Christopher Nolan 813.0 164.0
4 NaN Doug Walker NaN NaN
5 Color Andrew Stanton 462.0 132.0
6 Color Sam Raimi 392.0 156.0
7 Color Nathan Greno 324.0 100.0
8 Color Joss Whedon 635.0 141.0
9 Color David Yates 375.0 153.0
director_facebook_likes actor_3_facebook_likes actor_2_name \
0 0.0 855.0 Joel David Moore
1 563.0 1000.0 Orlando Bloom
2 0.0 161.0 Rory Kinnear
3 22000.0 23000.0 Christian Bale
4 131.0 NaN Rob Walker
5 475.0 530.0 Samantha Morton
6 0.0 4000.0 James Franco
7 15.0 284.0 Donna Murphy
8 0.0 19000.0 Robert Downey Jr.
9 282.0 10000.0 Daniel Radcliffe
actor_1_facebook_likes gross \
0 1000.0 760505847.0
1 40000.0 309404152.0
2 11000.0 200074175.0
3 27000.0 448130642.0
4 131.0 NaN
5 640.0 73058679.0
6 24000.0 336530303.0
7 799.0 200807262.0
8 26000.0 458991599.0
9 25000.0 301956980.0
genres ... \
0 Action|Adventure|Fantasy|Sci-Fi ...
1 Action|Adventure|Fantasy ...
2 Action|Adventure|Thriller ...
3 Action|Thriller ...
4 Documentary ...
5 Action|Adventure|Sci-Fi ...
6 Action|Adventure|Romance ...
7 Adventure|Animation|Comedy|Family|Fantasy|Musi... ...
8 Action|Adventure|Sci-Fi ...
9 Adventure|Family|Fantasy|Mystery ...
num_user_for_reviews language country content_rating budget \
0 3054.0 English USA PG-13 237000000.0
1 1238.0 English USA PG-13 300000000.0
2 994.0 English UK PG-13 245000000.0
3 2701.0 English USA PG-13 250000000.0
4 NaN NaN NaN NaN NaN
5 738.0 English USA PG-13 263700000.0
6 1902.0 English USA PG-13 258000000.0
7 387.0 English USA PG 260000000.0
8 1117.0 English USA PG-13 250000000.0
9 973.0 English UK PG 250000000.0
title_year actor_2_facebook_likes imdb_score aspect_ratio \
0 2009.0 936.0 7.9 1.78
1 2007.0 5000.0 7.1 2.35
2 2015.0 393.0 6.8 2.35
3 2012.0 23000.0 8.5 2.35
4 NaN 12.0 7.1 NaN
5 2012.0 632.0 6.6 2.35
6 2007.0 11000.0 6.2 2.35
7 2010.0 553.0 7.8 1.85
8 2015.0 21000.0 7.5 2.35
9 2009.0 11000.0 7.5 2.35
movie_facebook_likes
0 33000
1 0
2 85000
3 164000
4 0
5 24000
6 0
7 29000
8 118000
9 10000
[10 rows x 28 columns]
# List the column names; a bare `df.columns` expression displays nothing when
# the file is run as a script, so print it explicitly.
print(df.columns)
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
dtype='object')
# Pull the title column out as a plain list. Indexing (rather than df.get)
# raises a clear KeyError if the column is missing instead of handing None to
# list().
titles = list(df["movie_title"])
print(titles[:5])
# Remove the special characters around a title: strip() drops all leading and
# trailing whitespace (including "\xa0"), however many characters there are,
# whereas a fixed [:-1] slice always removes exactly one character.
print(titles[0].strip())
['Avatar\xa0', "Pirates of the Caribbean: At World's End\xa0", 'Spectre\xa0', 'The Dark Knight Rises\xa0', 'Star Wars: Episode VII - The Force Awakens\xa0 '] Avatar
# Tally how many titles have each length: {title length: number of movies},
# e.g. an entry 7: 138 means 138 titles are 7 characters long.
# NOTE: lengths are taken from the raw strings in `titles`, so any surrounding
# whitespace characters are counted too.
freq_title = {}
# The loop variable is deliberately not called `x`, so it does not overwrite
# the module-level `x` array used by the earlier plots.
for title in titles:
    title_len = len(title)
    # get(..., 0) covers a length seen for the first time, which starts at 1.
    freq_title[title_len] = freq_title.get(title_len, 0) + 1
{7: 138,
41: 17,
8: 226,
22: 138,
55: 3,
12: 330,
13: 306,
24: 85,
39: 13,
35: 30,
17: 239,
18: 219,
43: 8,
16: 264,
44: 9,
15: 284,
42: 14,
23: 109,
11: 328,
36: 21,
19: 213,
10: 294,
27: 69,
20: 176,
32: 43,
26: 74,
25: 82,
14: 274,
21: 138,
31: 47,
51: 5,
6: 139,
5: 91,
63: 1,
3: 8,
9: 267,
38: 24,
28: 51,
29: 65,
34: 31,
57: 1,
47: 4,
4: 35,
37: 18,
48: 4,
46: 7,
40: 14,
45: 4,
49: 4,
50: 3,
30: 36,
33: 21,
2: 5,
59: 2,
60: 1,
56: 1,
84: 1,
54: 2,
87: 1,
53: 2,
72: 1,
69: 1,
80: 1,
67: 1}
# Print the title-length -> movie-count mapping on a single line.
print(freq_title)
{7: 138, 41: 17, 8: 226, 22: 138, 55: 3, 12: 330, 13: 306, 24: 85, 39: 13, 35: 30, 17: 239, 18: 219, 43: 8, 16: 264, 44: 9, 15: 284, 42: 14, 23: 109, 11: 328, 36: 21, 19: 213, 10: 294, 27: 69, 20: 176, 32: 43, 26: 74, 25: 82, 14: 274, 21: 138, 31: 47, 51: 5, 6: 139, 5: 91, 63: 1, 3: 8, 9: 267, 38: 24, 28: 51, 29: 65, 34: 31, 57: 1, 47: 4, 4: 35, 37: 18, 48: 4, 46: 7, 40: 14, 45: 4, 49: 4, 50: 3, 30: 36, 33: 21, 2: 5, 59: 2, 60: 1, 56: 1, 84: 1, 54: 2, 87: 1, 53: 2, 72: 1, 69: 1, 80: 1, 67: 1}
# Scatter plot of the tally: title length on the x-axis against the number of
# movies with that length on the y-axis.
X = np.array(list(freq_title))            # iterating a dict yields its keys
Y = np.array(list(freq_title.values()))
plt.scatter(X, Y)
plt.title("movie data visualization problem")
plt.xlabel("length of movie title")
plt.ylabel("no. of movies having this much long title ")
# Author's observation: the resulting shape looks somewhat Gaussian.
plt.show()