BASIC DATA VISUALIZATION¶

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
In [3]:
#line plot

x=np.arange(10)
y1=x**2 
y2=2*x+3
print(x)
print(y1)
print(y2)
[0 1 2 3 4 5 6 7 8 9]
[ 0  1  4  9 16 25 36 49 64 81]
[ 3  5  7  9 11 13 15 17 19 21]
In [4]:
themes=plt.style.available
print(themes)
['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
In [5]:
plt.style.use("seaborn-dark-palette")
In [6]:
plt.plot(x,y1,color='red',label="Apple",marker="o")    # to plot 
#plt.show()    #  if both has to be printed on different graphs 
plt.plot(x,y2,color='green',label="Kiwi",linestyle="dashed")

plt.xlabel("Time")
plt.ylabel("Price")
plt.title("Prices of fruit over time ")
plt.legend()
plt.show()          # prints on the same graph
In [7]:
# agar x axis k liye data na bhi ho 
prices=np.array([1,2,3,4])**2
print(prices)
plt.plot(prices)
[ 1  4  9 16]
Out[7]:
[<matplotlib.lines.Line2D at 0x7f6cb4981690>]

SCATTER PLOTS¶

In [8]:
# SCATTER PLOTS 
plt.scatter(x,y1)
plt.show()
In [9]:
# adjust the size of any plot 
#plt.figure(figsize=(5,5))


plt.scatter(x,y1,color='red',label="Apple",marker="^")     
#plt.show()    
plt.scatter(x,y2,color='green',label="Kiwi",linestyle="dashed")

plt.xlabel("Time")
plt.ylabel("Price")
plt.title("Prices of fruit over time ")
plt.legend()
plt.show()         

BAR GRAPHS¶

In [10]:
plt.bar([0,1,2],[10,20,15]) #current year
plt.bar([0,1,2],[20,10,12])  #next year 
plt.show()

# this is overlapping , but we want side by side  
In [11]:
x_coor= np.array([0,1,2])*2
plt.bar(x_coor-0.25,[10,20,15],width=0.5,label="current year" , tick_label=["gold","silver","platinum"]) #current year
plt.bar(x_coor+0.25 ,[20,10,12],width=0.5,label="next year")  #next year 
plt.legend()
plt.ylim(0,40)   # for scaling
plt.xlim(-2,5)
plt.xlabel("Metal")
plt.ylabel("Price")
plt.title("Metal Price comparison")
plt.show()
In [12]:
#plt.style.use("dark_background")
x_coor= np.array([0,1,2])*2
plt.bar(x_coor-0.25,[10,20,15],width=0.5,label="current year" , tick_label=["gold","silver","platinum"]) #current year
plt.bar(x_coor+0.25 ,[20,10,12],width=0.5,label="next year",color="orange")  #next year 
plt.legend()
plt.xlabel("Metal")
plt.ylabel("Price")
plt.title("Metal Price comparison")
plt.show()

PIE CHART¶

In [13]:
subjects="ME","ITC","DCCN","DSP"
weightage= [10,20,15,5]
plt.pie(weightage,labels=subjects)
Out[13]:
([<matplotlib.patches.Wedge at 0x7f6cb4666b50>,
  <matplotlib.patches.Wedge at 0x7f6cb4666e10>,
  <matplotlib.patches.Wedge at 0x7f6cb45f24d0>,
  <matplotlib.patches.Wedge at 0x7f6cb45f2590>],
 [Text(0.8899186877588753, 0.6465637858537406, 'ME'),
  Text(-0.8899187180267095, 0.6465637441936395, 'ITC'),
  Text(1.0298943251329445e-07, -1.0999999999999954, 'DCCN'),
  Text(1.0461622140716127, -0.3399185517867209, 'DSP')])
In [14]:
subjects="ME","ITC","DCCN","DSP"
weightage= [10,20,15,5]
plt.pie(weightage,labels=subjects,explode=(1,0,0.1,0),autopct='%1.1f%%')
plt.show()

MOVIE DATA VISUALIZATION¶

In [15]:
import pandas as pd
In [16]:
pwd             # default directory
Out[16]:
'/kaggle/working'
In [17]:
df=pd.read_csv("movie_metadata.csv")
print(df.head(n=10))
   color      director_name  num_critic_for_reviews  duration  \
0  Color      James Cameron                   723.0     178.0   
1  Color     Gore Verbinski                   302.0     169.0   
2  Color         Sam Mendes                   602.0     148.0   
3  Color  Christopher Nolan                   813.0     164.0   
4    NaN        Doug Walker                     NaN       NaN   
5  Color     Andrew Stanton                   462.0     132.0   
6  Color          Sam Raimi                   392.0     156.0   
7  Color       Nathan Greno                   324.0     100.0   
8  Color        Joss Whedon                   635.0     141.0   
9  Color        David Yates                   375.0     153.0   

   director_facebook_likes  actor_3_facebook_likes       actor_2_name  \
0                      0.0                   855.0   Joel David Moore   
1                    563.0                  1000.0      Orlando Bloom   
2                      0.0                   161.0       Rory Kinnear   
3                  22000.0                 23000.0     Christian Bale   
4                    131.0                     NaN         Rob Walker   
5                    475.0                   530.0    Samantha Morton   
6                      0.0                  4000.0       James Franco   
7                     15.0                   284.0       Donna Murphy   
8                      0.0                 19000.0  Robert Downey Jr.   
9                    282.0                 10000.0   Daniel Radcliffe   

   actor_1_facebook_likes        gross  \
0                  1000.0  760505847.0   
1                 40000.0  309404152.0   
2                 11000.0  200074175.0   
3                 27000.0  448130642.0   
4                   131.0          NaN   
5                   640.0   73058679.0   
6                 24000.0  336530303.0   
7                   799.0  200807262.0   
8                 26000.0  458991599.0   
9                 25000.0  301956980.0   

                                              genres  ...  \
0                    Action|Adventure|Fantasy|Sci-Fi  ...   
1                           Action|Adventure|Fantasy  ...   
2                          Action|Adventure|Thriller  ...   
3                                    Action|Thriller  ...   
4                                        Documentary  ...   
5                            Action|Adventure|Sci-Fi  ...   
6                           Action|Adventure|Romance  ...   
7  Adventure|Animation|Comedy|Family|Fantasy|Musi...  ...   
8                            Action|Adventure|Sci-Fi  ...   
9                   Adventure|Family|Fantasy|Mystery  ...   

  num_user_for_reviews language  country  content_rating       budget  \
0               3054.0  English      USA           PG-13  237000000.0   
1               1238.0  English      USA           PG-13  300000000.0   
2                994.0  English       UK           PG-13  245000000.0   
3               2701.0  English      USA           PG-13  250000000.0   
4                  NaN      NaN      NaN             NaN          NaN   
5                738.0  English      USA           PG-13  263700000.0   
6               1902.0  English      USA           PG-13  258000000.0   
7                387.0  English      USA              PG  260000000.0   
8               1117.0  English      USA           PG-13  250000000.0   
9                973.0  English       UK              PG  250000000.0   

   title_year actor_2_facebook_likes imdb_score  aspect_ratio  \
0      2009.0                  936.0        7.9          1.78   
1      2007.0                 5000.0        7.1          2.35   
2      2015.0                  393.0        6.8          2.35   
3      2012.0                23000.0        8.5          2.35   
4         NaN                   12.0        7.1           NaN   
5      2012.0                  632.0        6.6          2.35   
6      2007.0                11000.0        6.2          2.35   
7      2010.0                  553.0        7.8          1.85   
8      2015.0                21000.0        7.5          2.35   
9      2009.0                11000.0        7.5          2.35   

  movie_facebook_likes  
0                33000  
1                    0  
2                85000  
3               164000  
4                    0  
5                24000  
6                    0  
7                29000  
8               118000  
9                10000  

[10 rows x 28 columns]
In [18]:
df.columns
Out[18]:
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')
In [19]:
titles = list(df.get(('movie_title')))
In [20]:
print(titles[:5])
print(titles[0][:-1])       #special chars ko hatane k liye
['Avatar\xa0', "Pirates of the Caribbean: At World's End\xa0", 'Spectre\xa0', 'The Dark Knight Rises\xa0', 'Star Wars: Episode VII - The Force Awakens\xa0            ']
Avatar
In [21]:
freq_title={}
for x in titles:
    l=len(x)
    
    if freq_title.get(l) is None:
        freq_title[l]=1
                                       # if a particular length is coming for first time then we make it as 1
    else:
        freq_title[l]+=1
        
In [22]:
freq_title          # 138 movies in which the title length is 7 and so on
Out[22]:
{7: 138,
 41: 17,
 8: 226,
 22: 138,
 55: 3,
 12: 330,
 13: 306,
 24: 85,
 39: 13,
 35: 30,
 17: 239,
 18: 219,
 43: 8,
 16: 264,
 44: 9,
 15: 284,
 42: 14,
 23: 109,
 11: 328,
 36: 21,
 19: 213,
 10: 294,
 27: 69,
 20: 176,
 32: 43,
 26: 74,
 25: 82,
 14: 274,
 21: 138,
 31: 47,
 51: 5,
 6: 139,
 5: 91,
 63: 1,
 3: 8,
 9: 267,
 38: 24,
 28: 51,
 29: 65,
 34: 31,
 57: 1,
 47: 4,
 4: 35,
 37: 18,
 48: 4,
 46: 7,
 40: 14,
 45: 4,
 49: 4,
 50: 3,
 30: 36,
 33: 21,
 2: 5,
 59: 2,
 60: 1,
 56: 1,
 84: 1,
 54: 2,
 87: 1,
 53: 2,
 72: 1,
 69: 1,
 80: 1,
 67: 1}
In [23]:
print(freq_title)
{7: 138, 41: 17, 8: 226, 22: 138, 55: 3, 12: 330, 13: 306, 24: 85, 39: 13, 35: 30, 17: 239, 18: 219, 43: 8, 16: 264, 44: 9, 15: 284, 42: 14, 23: 109, 11: 328, 36: 21, 19: 213, 10: 294, 27: 69, 20: 176, 32: 43, 26: 74, 25: 82, 14: 274, 21: 138, 31: 47, 51: 5, 6: 139, 5: 91, 63: 1, 3: 8, 9: 267, 38: 24, 28: 51, 29: 65, 34: 31, 57: 1, 47: 4, 4: 35, 37: 18, 48: 4, 46: 7, 40: 14, 45: 4, 49: 4, 50: 3, 30: 36, 33: 21, 2: 5, 59: 2, 60: 1, 56: 1, 84: 1, 54: 2, 87: 1, 53: 2, 72: 1, 69: 1, 80: 1, 67: 1}
In [24]:
X=np.array(list(freq_title.keys()))
Y=np.array(list(freq_title.values()))
#print(X,Y)
In [25]:
plt.scatter(X,Y)
plt.xlabel("length of movie title")
plt.ylabel("no. of movies having this much long title ")
plt.title("movie data visualization problem")
plt.show()               # ye kuch gaussian sa aa rha hai