pandas
This is a quick tour of the basics of pandas, including plotting its data with matplotlib. For more detail, see the official pandas documentation.
You can install pandas on your own machine (e.g. with python3 -m pip install pandas). It is also available on Google Colab.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Plot appearance settings
# Colour scheme: select a named matplotlib style
plt.style.use("Solarize_Light2")
# Larger text for labels, titles, legend and tick marks
params = {
    "axes.labelsize": 18,
    "axes.titlesize": 32,
    "font.size": 20,
    "legend.fontsize": 20,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
}
# Global defaults such as these sizes are set through mpl.rcParams
# (the matplotlib package itself), not through a plt function.
# Applied after the style is selected, so these values are the
# ones in effect afterwards.
mpl.rcParams.update(params)
# Load a spreadsheet into a DataFrame and display it.
df = pd.read_excel("data/animals.xlsx")
df
# Missing data in a float column is shown by pandas as "NaN"
# (not a number). Here that happens because the worldwide
# population of capybaras is not known.

# read_html accepts a URL, string, or urllib response object
# and returns a list of DataFrames, one per table found in
# the document.
calendar_tables = pd.read_html("https://catalog.uic.edu/ucat/academic-calendar/")
# Keep only the *first* table (corresponding to summer 2019)
calendar_tables[0]
This is the example we'll work with for the rest of the lecture.
# Read the electricity data; the "date" column named in the
# header row becomes the index, and parse_dates=True asks
# pandas to parse that index as dates.
df = pd.read_csv(
    "data/europe-electricity/europe-electricity-2017-to-2019.csv",
    index_col="date",
    parse_dates=True,
)
# The whole table is a DataFrame
type(df)
# Selecting by column name gives one column of the table ...
france = df["France"]
france
# ... and a single column is a Series
type(france)
# Dates in the index are looked up with pd.Timestamp objects
pd.Timestamp("2018-04-28")
# Single value: DataFrame[column_name][index]
df["United Kingdom"][pd.Timestamp("2018-04-28")]

# A one-week window starting at t0
t0 = pd.Timestamp("2018-04-28")
t1 = t0 + pd.Timedelta(days=7)
uk = df["United Kingdom"]
uk[t0:t1]
# Gotcha: unlike ordinary Python slices, pandas slices on
# the index include the end point!
df.loc[t0:t1]  # every row from t0 through t1

# Boolean row selection: Italy < 30k and Germany > 50k
italy_low = df["Italy"] < 30000.0
germany_high = df["Germany"] > 50000.0
df.loc[italy_low & germany_high]
import json

# Load the population table from JSON. It is indexed by country
# name further down (pop[c]); judging by the file name the values
# are presumably populations in millions for 2019 -- verify
# against the data file.
# The encoding is given explicitly so the result does not depend
# on the platform's default text encoding.
with open(
    "data/europe-electricity/europe-pop-millions-2019.json",
    encoding="utf-8",
) as infile:
    pop = json.load(infile)
pop
# Convert the original dataframe to one that has per capita
# consumption in each column.
# Since this modifies the data, make sure we start with the
# original (total MW) data by reloading it.
df = pd.read_csv(
    "data/europe-electricity/europe-electricity-2017-to-2019.csv",
    parse_dates=True,
    index_col="date",  # refers to name in header row
)
# Divide each country's column, in place, by that country's
# entry in pop.
for c in ["Italy", "Germany", "France", "United Kingdom"]:
    df[c] /= pop[c]
# This shows dataframes are MUTABLE
# Now df contains figures in units of MW/Mperson
# i.e. W per capita
df
# Aside: It's very common to use dataframe mutations to "fix"
# columns. E.g. if a column contains temperature strings like
# "43.1C", the trailing unit letter can be removed and the
# column replaced with a column of floats.
# The electricity dataframe df presumably has no "temperature"
# column (only country columns are used elsewhere), so the idea
# is shown on a small standalone dataframe instead of on df.
weather = pd.DataFrame({"temperature": ["43.1C", "38.5C", "-2.0C"]})
# .str[:-1] drops the last character of every entry at once;
# astype then converts the remaining text to floats.
weather["temperature"] = weather["temperature"].str[:-1].astype("float")
weather
# pandas objects *can* be passed directly to matplotlib as the
# x and y arrays of a plot: df.index is the column of dates
# (the x axis) and df["France"] is the y axis.
plt.plot(df.index, df["France"])

# DataFrame.plot uses plt.plot behind the scenes; the object it
# returns is used to set the title and the y-axis label.
ax = df.plot(figsize=(15, 8))
ax.set_title("Electrical energy consumption 2017-2019", pad=20)
ax.set_ylabel("W per capita")

# Same plot after averaging over a rolling window of 7 rows.
weekly_mean = df.rolling(7).mean()
ax = weekly_mean.plot(figsize=(15, 8))  # uses plt.plot behind the scenes
ax.set_title(
    "Electrical energy consumption 2017-2019 (7-day moving average)",
    size=22,
    pad=20,
)
ax.set_ylabel("W per capita")