In this article, we look at a world population dataset and walk through a couple of Python examples that analyze and visualize it.
You can download the dataset from the link at the bottom of the article. Here are the columns:
CCA3: 3 Digit Country/Territory Code
Country/Territory: Name of the Country/Territory
Capital: Name of the Capital
Continent: Name of the Continent
2022 Population: Population of the Country/Territories in the year 2022
2020 Population: Population of the Country/Territories in the year 2020
2015 Population: Population of the Country/Territories in the year 2015
2010 Population: Population of the Country/Territories in the year 2010
2000 Population: Population of the Country/Territories in the year 2000
1990 Population: Population of the Country/Territories in the year 1990
1980 Population: Population of the Country/Territories in the year 1980
1970 Population: Population of the Country/Territories in the year 1970
Area (km²): Area size of the Country/Territories in square kilometers
Density (per km²): Population Density per square kilometer
Growth Rate: Population Growth Rate by Country/Territories
World Population Percentage: The population percentage by each Country/Territories
As you can see, there is a lot we can do with a dataset like this, such as finding the ten most populous countries, either worldwide or by continent.
Code Examples
Let’s load our dataset and do some basic analysis
We will check for missing values or duplicates
# Count the missing (null) values in each column
print(df.isnull().sum())
# Count the fully duplicated rows
print(df.duplicated().sum())
This is a very good dataset and there were no missing values or duplicates
# Imports
import pandas as pd
import numpy as np

# Data visualization using the plotly graph objects (go) API
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio

# For showing plotly plots in a notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode

# Notebook mode initialisation is left disabled here:
# py.init_notebook_mode()

# Load the dataset; the relative path is resolved against the working directory
df = pd.read_csv('world_population.csv')

# Data analysis: preview the first rows and show the (rows, columns) shape
print(df.head())
print(df.shape)

# Count the missing (null) values in each column
print(df.isnull().sum())

# Count the fully duplicated rows
print(df.duplicated().sum())
Top countries by population
Now let's look at the countries with the highest population in 2022
# Imports
import pandas as pd
import numpy as np

# Data visualization using the plotly graph objects (go) API
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio

# For showing plotly plots in a notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode

# Notebook mode initialisation is left disabled here:
# py.init_notebook_mode()

# Load the dataset; the relative path is resolved against the working directory
df = pd.read_csv('world_population.csv')

# Data analysis: preview the first rows and show the (rows, columns) shape
print(df.head())
print(df.shape)

# Count the missing (null) values in each column
print(df.isnull().sum())

# Count the fully duplicated rows
print(df.duplicated().sum())

# Colour palette used for the bars
colors = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]

# Sort descending by 2022 population and keep the first ten rows
top_pop = df.sort_values(by='2022 Population', ascending=False).head(10)
print(top_pop[['Country/Territory', '2022 Population']])

# One bar per country, labelled with its population value above the bar
data = go.Bar(
    x=top_pop['Country/Territory'],
    y=top_pop['2022 Population'],
    text=top_pop['2022 Population'],
    textposition='outside',
    textfont=dict(size=30, color='black'),
    marker=dict(color=colors, opacity=0.7, line_color='black', line_width=2),
)

layout = go.Layout(
    title={
        'text': "<b>Top 10 Countries with highest population</b>",
        'x': 0.5,
        'xanchor': 'center',
    },
    xaxis=dict(title='Countries'),
    yaxis=dict(title='Populations'),
    width=900,
    height=600,
    template='plotly_white',
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)
Smallest countries by population
Now let's look at the countries with the lowest population in 2022
# Sort ascending by 2022 population and keep the first ten rows
lowest_pop = df.sort_values(by = '2022 Population', ascending = True).head(10)
# Imports
import pandas as pd
import numpy as np

# Data visualization using the plotly graph objects (go) API
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio

# For showing plotly plots in a notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode

# Notebook mode initialisation is left disabled here:
# py.init_notebook_mode()

# Load the dataset; the relative path is resolved against the working directory
df = pd.read_csv('world_population.csv')

# Data analysis: preview the first rows and show the (rows, columns) shape
print(df.head())
print(df.shape)

# Count the missing (null) values in each column
print(df.isnull().sum())

# Count the fully duplicated rows
print(df.duplicated().sum())

# Colour palette used for the bars
colors = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]

# Sort ascending by 2022 population and keep the first ten rows
lowest_pop = df.sort_values(by='2022 Population', ascending=True).head(10)
print(lowest_pop[['Country/Territory', '2022 Population']])

# One bar per country, labelled with its population value above the bar
data = go.Bar(
    x=lowest_pop['Country/Territory'],
    y=lowest_pop['2022 Population'],
    text=lowest_pop['2022 Population'],
    textposition='outside',
    textfont=dict(size=10, color='black'),
    marker=dict(color=colors, opacity=0.7, line_color='black', line_width=2),
)

layout = go.Layout(
    title={'text': '<b>Top 10 Countries with the lowest population</b>', 'x': 0.5},
    xaxis=dict(title='<b>Countries</b>'),
    yaxis=dict(title='<b>Population</b>'),
    width=900,
    height=700,
    template='plotly_white',
)

lowfig = go.Figure(data=data, layout=layout)
# Rotate the country names so they do not overlap
lowfig.update_xaxes(tickangle=90, tickfont_size=12)
iplot(lowfig)
Largest European countries by population
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data visualization using the plotly graph objects (go) API
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio
import plotly.express as px

# For showing plotly plots in a notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode

# Notebook mode initialisation is left disabled here:
# py.init_notebook_mode()

# Load the dataset; the relative path is resolved against the working directory
df = pd.read_csv('world_population.csv')

# Data analysis: preview the first rows and show the (rows, columns) shape
print(df.head())
print(df.shape)

# Count the missing (null) values in each column
print(df.isnull().sum())

# Count the fully duplicated rows
print(df.duplicated().sum())

# Colour palette; every second colour of the first eleven is used for the bars
colors = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]

sns.set(rc={"axes.facecolor": "#F2EAC5", "figure.facecolor": "#F2EAC5"})
plt.subplots(figsize=(20, 10))

# Filter the European rows once and reuse them for both the data and the order
europe = df[df["Continent"] == "Europe"]
# Names of the 11 most populous European countries, largest first
top_europe = europe.sort_values("2022 Population", ascending=False)["Country/Territory"][:11]

p = sns.barplot(
    data=europe,
    y="Country/Territory",
    x="2022 Population",
    order=top_europe,
    palette=colors[0:11:2],
    saturation=1,
    edgecolor="#1c1c1c",
    linewidth=4,
)

p.set_title("\nEuropean Population 2022\n", fontsize=25)
p.set_xlabel("Population", fontsize=20)
p.set_ylabel("\nCountry", fontsize=20)
# Rotate the x tick labels without overwriting their text; re-assigning the
# labels via set_xticklabels(get_xticklabels()) freezes whatever text exists
# at call time and makes matplotlib warn about a fixed formatter.
p.tick_params(axis="x", labelrotation=90)

# Annotate every bar with its population value in a rounded orange box
for container in p.containers:
    p.bar_label(
        container,
        label_type="edge",
        padding=6,
        size=25,
        color="black",
        rotation=0,
        bbox={
            "boxstyle": "round",
            "pad": 0.4,
            "facecolor": "orange",
            "edgecolor": "#1c1c1c",
            "linewidth": 2,
            "alpha": 1,
        },
    )

sns.despine(left=True, bottom=True)
plt.show()
Continents by percentage
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data visualization using the plotly graph objects (go) API
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import plotly.io as pio
import plotly.express as px

# For showing plotly plots in a notebook
import plotly.offline as py
from plotly.offline import init_notebook_mode

# Notebook mode initialisation is left disabled here:
# py.init_notebook_mode()

# Load the dataset; the relative path is resolved against the working directory
df = pd.read_csv('world_population.csv')

# Data analysis: preview the first rows and show the (rows, columns) shape
print(df.head())
print(df.shape)

# Count the missing (null) values in each column
print(df.isnull().sum())

# Count the fully duplicated rows
print(df.duplicated().sum())

# Colour palette used for the pie slices
colors = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]

# Sum the world population percentage per continent, largest share first
cont_pop = (
    df.groupby('Continent')[['World Population Percentage']]
    .sum()
    .sort_values(by='World Population Percentage', ascending=False)
)
# Print the table explicitly; a bare `cont_pop` expression shows nothing in a script
print(cont_pop)

fig = go.Figure(
    data=go.Pie(
        labels=cont_pop.index,
        values=cont_pop['World Population Percentage'].values,
    )
)
# hole=0.4 turns the pie into a donut chart
fig.update_traces(
    hoverinfo='label',
    hole=0.4,
    textfont_size=18,
    textposition='auto',
    marker=dict(colors=colors, line=dict(color='white', width=2)),
)
fig.update_layout(
    title={'text': '<b>Continent Population Percentage</b>', 'x': 0.21},
    template='xgridoff',
    width=900,
    height=600,
    legend=dict(
        title_font_family="Times New Roman",
        font=dict(family="Courier", size=20, color="black"),
        bgcolor="white",
        bordercolor="Black",
        borderwidth=2.5,
    ),
)
iplot(fig)
Links
If you want the dataset and the code examples, they are available from:
https://github.com/programmershelp/maxpython/tree/main/Data%20Analysis/worldpopulation