Python basic data visualization

Load Packages

python

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='white', color_codes=True)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

Import CSV from the Local Machine

python

# Load gapminder.csv; the relative path is resolved against the current
# working directory.
df = pd.read_csv('gapminder.csv')

# Alternatives for an absolute Windows path: either escape every backslash
# or use a raw string so the backslashes are taken literally.
# df = pd.read_csv('c:\\Users\\faisal\\Desktop\\Python\\Lesson-2\\gapminder.csv')
# df = pd.read_csv(r'c:\Users\faisal\Desktop\Python\Lesson-2\gapminder.csv')

python

# Preview the first rows of the DataFrame (head() shows 5 rows by default).
df.head()

country  year  infant_mortality  life_expectancy  fertility  \
0              Albania  1960            115.40            62.87       6.19
1              Algeria  1960            148.20            47.50       7.65
2               Angola  1960            208.00            35.98       7.32
3  Antigua and Barbuda  1960               NaN            62.97       4.43
4            Argentina  1960             59.87            65.39       3.11

   population           gdp continent           region
0   1636054.0           NaN    Europe  Southern Europe
1  11124892.0  1.382815e+10    Africa  Northern Africa
2   5270844.0           NaN    Africa    Middle Africa
3     54681.0           NaN  Americas        Caribbean
4  20619075.0  1.083220e+11  Americas    South America

python

# Print a summary of the DataFrame: index range, column names, non-null
# counts, dtypes and memory usage.
df.info()

text

<class 'pandas.core.frame.DataFrame'>
    RangeIndex: 10545 entries, 0 to 10544
    Data columns (total 9 columns):
    country             10545 non-null object
    year                10545 non-null int64
    infant_mortality    9092 non-null float64
    life_expectancy     10545 non-null float64
    fertility           10358 non-null float64
    population          10360 non-null float64
    gdp                 7573 non-null float64
    continent           10545 non-null object
    region              10545 non-null object
    dtypes: float64(5), int64(1), object(3)
    memory usage: 741.5+ KB

python

# Convert the text columns to the pandas 'category' dtype instead of
# generic object columns.
for column in ('country', 'continent', 'region'):
    df[column] = df[column].astype('category')

python

# Print the summary again to inspect the dtypes and memory usage after the
# text columns were converted to 'category'.
df.info()

text

<class 'pandas.core.frame.DataFrame'>
    RangeIndex: 10545 entries, 0 to 10544
    Data columns (total 9 columns):
    country             10545 non-null category
    year                10545 non-null int64
    infant_mortality    9092 non-null float64
    life_expectancy     10545 non-null float64
    fertility           10358 non-null float64
    population          10360 non-null float64
    gdp                 7573 non-null float64
    continent           10545 non-null category
    region              10545 non-null category
    dtypes: category(3), float64(5), int64(1)
    memory usage: 543.0 KB

python

# Distribution of life expectancy: histogram with a density curve overlaid.
# NOTE(review): seaborn >= 0.11 deprecates distplot in favour of
# histplot/displot -- consider migrating once the seaborn version allows it.
d = sns.distplot(df.life_expectancy)

python

# Distribution of life expectancy with the histogram limited to 10 bins.
d = sns.distplot(df.life_expectancy, bins=10)

python

# Scatterplot: life expectancy against fertility for a single year.
# plt.figure(figsize=(10, 7))  # uncomment to resize the plot
df2 = df[df.year == 1962]  # keep only the rows for 1962
s = sns.scatterplot(
    data=df2,
    x='fertility',
    y='life_expectancy',
    hue='continent',  # colour the points by continent
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
plt.ylabel("Life Expectancy")  # y label (spelling corrected)
plt.xlabel("Fertility")  # x label

text

Text(0.5, 0, 'Fertility')

python

# Linear model plot: scatter of life expectancy against fertility with a
# fitted regression line, for a single year.
df2 = df[df.year == 1962]  # keep only the rows for 1962
# `height` is the current name of the facet-size keyword; the older `size`
# spelling was renamed in seaborn 0.9 and is rejected by newer releases.
l = sns.lmplot(
    data=df2,
    x='fertility',
    y='life_expectancy',
    height=7,
    aspect=1,
    scatter_kws={"s": 100},  # marker size passed through to the scatter call
)
plt.ylabel("Life Expectancy")  # y label (spelling corrected)
plt.xlabel("Fertility")  # x label

text

Text(0.5, 8.96000000000003, 'Fertility')

python

# Boxplot: life expectancy per continent, using the full (unfiltered) data.
plt.figure(figsize=(10, 7))  # create a larger figure before drawing
bp = sns.boxplot(x='continent', y='life_expectancy', data=df)

python

# Joint plot: fertility against life expectancy for 1962, together with
# the marginal distribution of each variable.
df2 = df.loc[df.year == 1962]  # keep only the rows for 1962
j = sns.jointplot(x='fertility', y='life_expectancy', data=df2, color='g')

python

# Joint plot of the same two variables, drawn with kind='reg'
# (adds a regression fit to the joint axes).
j = sns.jointplot(
    x='fertility',
    y='life_expectancy',
    data=df2,
    kind='reg',
    color='g',
)

python

# Joint plot of the same two variables, drawn with kind='hex'
# (hexagonal binning in the joint axes).
j = sns.jointplot(
    x='fertility',
    y='life_expectancy',
    data=df2,
    kind='hex',
    color='g',
)

# Apply the seaborn theme settings (white style, short colour codes).
sns.set(style='white', color_codes=True)

python

# Histogram of life expectancy drawn with matplotlib directly, using 5 bins.
plt.hist(df.life_expectancy, bins=5)

text

(array([   6.,  264., 1984., 3936., 4355.]),
     array([13.2 , 27.34, 41.48, 55.62, 69.76, 83.9 ]),
     <a list of 5 Patch objects>)

python

# Show the category labels defined for the categorical 'continent' column.
df.continent.cat.categories

Index(['Africa', 'Americas', 'Asia', 'Europe', 'Oceania'], dtype='object')

python

# Distinct values of the column; the SQL analogue is
# SELECT DISTINCT column_name FROM table.
df.continent.unique()

text

[Europe, Africa, Americas, Asia, Oceania]
    Categories (5, object): [Europe, Africa, Americas, Asia, Oceania]

python

# Stacked histogram of life expectancy, one stacked layer per continent.
continents = ['Africa', 'Americas', 'Asia', 'Europe', 'Oceania']
# One life-expectancy Series per continent, in the same order as the labels.
life_expectancy_by_continent = [
    df.loc[df.continent == name, 'life_expectancy'] for name in continents
]

plt.figure(figsize=(10, 7))
plt.hist(
    life_expectancy_by_continent,
    bins=15,
    stacked=True,
    label=continents,
)
plt.legend()
plt.show()