python-in-quarto

Author

shitao

以下测试代码来自这里

Some content …

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

Data import and basic manipulation

# Read the aid records from the CSV file next to this document and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='aiddata.csv')
df.head(n=5)
aiddata_id aiddata_2_id year donor recipient commitment_amount_usd_constant coalesced_purpose_code coalesced_purpose_name
0 2414478.0 NaN 1977 Saudi Arabia India 348718518.0 23030 Power generation/renewable sources
1 2414509.0 NaN 1977 Saudi Arabia Brazil 191647004.0 23040 Electrical transmission/ distribution
2 2414635.0 NaN 1983 Saudi Arabia India 79371799.0 21030 Rail transport
3 2414665.0 NaN 1984 Saudi Arabia Taiwan 212202942.0 21030 Rail transport
4 2414667.0 NaN 1984 Saudi Arabia Korea 134511154.0 21040 Water transport
# The two leading identifier columns (aiddata_id, aiddata_2_id) are not
# needed, so keep everything from the third column onward.
df = df[df.columns[2:]]

# Replace the long source headers with short names, in column order.
short_names = ['year', 'donor', 'recipient', 'amount', 'purpose_code', 'purpose_name']
df.columns = short_names
df.head()
year donor recipient amount purpose_code purpose_name
0 1977 Saudi Arabia India 348718518.0 23030 Power generation/renewable sources
1 1977 Saudi Arabia Brazil 191647004.0 23040 Electrical transmission/ distribution
2 1983 Saudi Arabia India 79371799.0 21030 Rail transport
3 1984 Saudi Arabia Taiwan 212202942.0 21030 Rail transport
4 1984 Saudi Arabia Korea 134511154.0 21040 Water transport
# check the shape: (number of rows, number of columns)
df.shape

# close to 100K rows!
(98540, 6)
# Earliest and latest year covered by the data.
years = df['year']
(min(years), max(years))
(1973, 2013)

Task 1

Some content …

# We first get the donation data: the total amount each donor country
# committed in each year.
# A single vectorized groupby replaces the manual loop over groups and the
# builtin sum(); groups come out sorted by (year, donor), as before.
# NOTE: pandas' sum skips missing amounts instead of propagating NaN.
donation_df = (
    df.groupby(['year', 'donor'], as_index=False)['amount']
    .sum()
    .rename(columns={'donor': 'country', 'amount': 'donation'})
)
donation_df.head()
year country donation
0 1973 Australia 46285863.0
1 1973 Belgium 39251336.0
2 1973 Canada 437928427.0
3 1973 France 247189555.0
4 1973 Germany 562232384.0
# Then we get the receiving data: the total amount each recipient country
# was granted in each year.
# A single vectorized groupby replaces the manual loop over groups and the
# builtin sum(); groups come out sorted by (year, recipient), as before.
# NOTE: pandas' sum skips missing amounts instead of propagating NaN.
receiving_df = (
    df.groupby(['year', 'recipient'], as_index=False)['amount']
    .sum()
    .rename(columns={'recipient': 'country', 'amount': 'receiving'})
)
receiving_df.head()
year country receiving
0 1973 Brazil 3.120750e+08
1 1973 Chile 8.805608e+07
2 1973 Colombia 5.499448e+08
3 1973 Cyprus 9.613414e+06
4 1973 India 2.285257e+09
# Pool donors and recipients, keeping each country name only once.
all_cntry = list(set(df.donor).union(df.recipient))

# there are in total 47 unique countries
len(all_cntry)
47
# We have so many countries that it is difficult to visualize them all in
# one plot, so each country is assigned to a group.
# Basically, I want to plot 4 countries in each figure: with 47 countries,
# cycling through 12 group labels gives 4 countries per group (3 in the last).
# The labels cycle 1, 2, ..., 12, 1, 2, ... and there is exactly one label
# per country, so the list no longer has to be trimmed by hand when the
# number of countries changes.
n_groups = 12
cntry_group_list = [i % n_groups + 1 for i in range(len(all_cntry))]
# Issue: a year only lists the countries that actually received aid in it,
# e.g. 1973 has far fewer rows than there are countries overall.
receiving_df.loc[receiving_df['year'] == 1973]
year country receiving
0 1973 Brazil 3.120750e+08
1 1973 Chile 8.805608e+07
2 1973 Colombia 5.499448e+08
3 1973 Cyprus 9.613414e+06
4 1973 India 2.285257e+09
5 1973 Korea 1.363707e+09
6 1973 Kuwait 3.254830e+05
7 1973 Saudi Arabia 6.509700e+04
8 1973 Thailand 2.063634e+08
9 1973 United Arab Emirates 6.509700e+04
# same issue for donation data: only the countries that donated in 1973
# show up for that year
donation_df.loc[donation_df['year'] == 1973]
year country donation
0 1973 Australia 4.628586e+07
1 1973 Belgium 3.925134e+07
2 1973 Canada 4.379284e+08
3 1973 France 2.471896e+08
4 1973 Germany 5.622324e+08
5 1973 Italy 1.667191e+08
6 1973 Japan 9.389659e+08
7 1973 Netherlands 1.627509e+08
8 1973 Norway 3.587485e+07
9 1973 Sweden 1.683693e+08
10 1973 Switzerland 1.406094e+07
11 1973 United Kingdom 4.425792e+08
12 1973 United States 1.553264e+09
# Pad every year with zero-donation rows for the countries that did not
# donate, so each year lists the same full set of countries.
donation_dfs = []
for year, year_rows in donation_df.groupby('year'):
    donors_this_year = set(year_rows.country)
    missing = [c for c in all_cntry if c not in donors_this_year]
    zero_rows = pd.DataFrame({'year': year, 'country': missing, 'donation': 0})
    padded = pd.concat([year_rows, zero_rows], ignore_index=True)
    # Sort alphabetically so the positional group labels land on the same
    # country in every year.
    padded = padded.sort_values(by='country', ascending=True)
    padded['group'] = cntry_group_list
    donation_dfs.append(padded)
donation = pd.concat(donation_dfs, ignore_index=True)
donation.head()
year country donation group
0 1973 Australia 46285863.0 1
1 1973 Austria 0.0 2
2 1973 Belgium 39251336.0 3
3 1973 Brazil 0.0 4
4 1973 Canada 437928427.0 5
# Pad every year with zero-receiving rows for the countries that received
# nothing, so each year lists the same full set of countries.
receiving_dfs = []
for year, year_rows in receiving_df.groupby('year'):
    recipients_this_year = set(year_rows.country)
    missing = [c for c in all_cntry if c not in recipients_this_year]
    zero_rows = pd.DataFrame({'year': year, 'country': missing, 'receiving': 0})
    padded = pd.concat([year_rows, zero_rows], ignore_index=True)
    # Sort alphabetically so the positional group labels land on the same
    # country in every year.
    padded = padded.sort_values(by='country', ascending=True)
    padded['group'] = cntry_group_list
    receiving_dfs.append(padded)
receiving = pd.concat(receiving_dfs, ignore_index=True)
receiving.head()
year country receiving group
0 1973 Australia 0.0 1
1 1973 Austria 0.0 2
2 1973 Belgium 0.0 3
3 1973 Brazil 312075045.0 4
4 1973 Canada 0.0 5
# to check whether the countries in the two frames are the same.
# Compare the sets themselves: the element order of list(set(...)) is
# arbitrary, so comparing the lists could give False for identical members.
r_c = set(receiving.country)
d_c = set(donation.country)
r_c == d_c
True
# Combine donation and receiving into one frame, one row per (year, country).
# Joining on the keys builds a new frame (so `donation` is not modified
# through an alias) and does not depend on both frames having their rows in
# exactly the same order. validate= raises if a key is duplicated.
all_df = donation.merge(
    receiving[['year', 'country', 'receiving']],
    on=['year', 'country'],
    how='left',
    validate='one_to_one',
)
# Net position: positive for net donors, negative for net recipients.
all_df['d_minus_r'] = all_df['donation'] - all_df['receiving']
all_df.head()
year country donation group receiving d_minus_r
0 1973 Australia 46285863.0 1 0.0 46285863.0
1 1973 Austria 0.0 2 0.0 0.0
2 1973 Belgium 39251336.0 3 0.0 39251336.0
3 1973 Brazil 0.0 4 312075045.0 -312075045.0
4 1973 Canada 437928427.0 5 0.0 437928427.0
# The chart encodes 'year:T' as a temporal field, so turn the integer year
# into a datetime before plotting.
all_df['year'] = pd.to_datetime(all_df['year'], format='%Y')
# We only plot group 1, for simplicity
group1 = all_df[all_df.group == 1]
# An Altair chart is not a matplotlib figure: the former plt.figure() /
# plt.show() pair only produced an empty "Figure ... with 0 Axes" and left
# the chart expression in the middle of the cell, where its value was
# discarded. Keeping the chart as the final expression lets the
# notebook/Quarto front end display it (assumes the usual rule that a cell
# shows the value of its last expression).
chart = alt.Chart(group1).mark_line().encode(
    x='year:T',
    y=alt.Y(
        'd_minus_r:Q',
        title='Donation minus receiving'
    ),
    color='country:N',
    strokeDash='country:N'
)
chart
<Figure size 672x480 with 0 Axes>
Altair plot render

Quarto 貌似渲染不出 altair 画的图。日后琢磨。

Using Python Visualization Libraries in RStudio

Matplotlib

import numpy as np
import matplotlib.pyplot as plt

# Fix the seed so the sample, and therefore the figure, is reproducible.
np.random.seed(0)

# Draw 100 values from a normal distribution with mean 200 and std 25.
mu, sigma = 200, 25
x = np.random.normal(loc=mu, scale=sigma, size=100)

# Two panels side by side.
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(8, 4))

# Left panel: 20 equal-width bins, drawn as a filled step outline.
ax0.hist(
    x,
    bins=20,
    density=True,
    histtype='stepfilled',
    facecolor='g',
    alpha=0.75,
)
ax0.set_title('stepfilled')

# Right panel: explicit, unequally spaced bin edges.
bin_edges = [100, 150, 180, 195, 205, 220, 250, 300]
ax1.hist(x, bins=bin_edges, density=True, histtype='bar', rwidth=0.8)
ax1.set_title('unequal bins')

fig.tight_layout()
plt.show()

Pandas

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 10x10 table of absolute standard-normal draws, accumulated down the rows.
# DataFrame.abs() is the vectorized equivalent of applymap(abs), which is
# deprecated in recent pandas releases.
dd = pd.DataFrame(np.random.randn(10, 10)).abs()
dd = dd.cumsum()

# Draw on an explicitly created Axes: without ax=, DataFrame.plot opens its
# own figure, which left the preceding plt.figure() empty
# ("Figure ... with 0 Axes").
bar_fig, bar_ax = plt.subplots()
dd.plot.bar(colormap='Greens', ax=bar_ax)
plt.show()
<Figure size 672x480 with 0 Axes>

Seaborn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Global seaborn look for the figure below.
sns.set(style="whitegrid", palette="muted")

# Load the example iris dataset and reshape it to long ("tidy") form:
# one row per (species, measurement, value).
iris = sns.load_dataset("iris").melt(id_vars="species", var_name="measurement")

# Fresh figure for the plot; seaborn draws on the current axes.
plt.figure()

# Categorical scatterplot showing every observation, coloured by species.
sns.swarmplot(
    data=iris,
    x="measurement",
    y="value",
    hue="species",
    palette=["r", "c", "y"],
)
plt.show()