33"""
44
55
def gapminder(datetimes=False, centroids=False, year=None):
    """
    Each row represents a country on a given year.

    https://www.gapminder.org/data/

    Args:
        datetimes: If True, the 'year' column is converted to a datetime column
            (each year becomes January 1st of that year).
        centroids: If True, the 'centroid_lat' and 'centroid_lon' columns are
            kept; otherwise they are dropped from the result.
        year: If not None, only the rows for that year are returned.

    Returns:
        A `pandas.DataFrame` with 1704 rows (when not filtered by `year`) and
        the following columns:
        `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
        'iso_alpha', 'iso_num']`.
        If `datetimes` is True, the 'year' column will be a datetime column.
        If `centroids` is True, two new columns are added:
        `['centroid_lat', 'centroid_lon']`.
        If `year` is an integer, the dataset will be filtered for that year.
    """
    df = _get_dataset("gapminder")
    # Filter while 'year' is still numeric: once the column has been converted
    # to datetimes, comparing it against an integer year no longer matches.
    if year is not None:
        df = df.query("year == %d" % year)
    if not centroids:
        # Non-inplace drop so the frame handed back by _get_dataset (or a
        # filtered view of it) is never mutated.
        df = df.drop(["centroid_lat", "centroid_lon"], axis=1)
    if datetimes:
        # assign() returns a new frame and keeps 'year' at its original position.
        df = df.assign(
            year=(df["year"].astype(str) + "-01-01").astype("datetime64[ns]")
        )
    return df
1828
1929
def tips():
    """
    Load the restaurant tips dataset, where each row is one restaurant bill.

    https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html

    Returns:
        A `pandas.DataFrame` with 244 rows and the following columns:
        `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.
    """
    tips_df = _get_dataset("tips")
    return tips_df
3140
3241
def iris():
    """
    Load the iris flower dataset, where each row is one flower.

    https://en.wikipedia.org/wiki/Iris_flower_data_set

    Returns:
        A `pandas.DataFrame` with 150 rows and the following columns:
        `['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'species', 'species_id']`.
    """
    iris_df = _get_dataset("iris")
    return iris_df
4452
4553
def wind():
    """
    Load the wind dataset, where each row is a level of wind intensity in a
    cardinal direction, together with its frequency.

    Returns:
        A `pandas.DataFrame` with 128 rows and the following columns:
        `['direction', 'strength', 'frequency']`.
    """
    wind_df = _get_dataset("wind")
    return wind_df
5562
5663
def election():
    """
    Load the election dataset, where each row holds the voting results for an
    electoral district in the 2013 Montreal mayoral election.

    Returns:
        A `pandas.DataFrame` with 58 rows and the following columns:
        `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner',
        'result', 'district_id']`.
    """
    election_df = _get_dataset("election")
    return election_df
6773
6874
6975def election_geojson ():
7076 """
71- Each feature represents an electoral district in the 2013 Montreal mayoral election.
77+ Each feature represents an electoral district in the 2013 Montreal mayoral election.
7278
73- Returns:
74- A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
75- is an electoral district numerical ID and whose `district` property is the ID and
76- district name.
77- """
79+ Returns:
80+ A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
81+ is an electoral district numerical ID and whose `district` property is the ID and
82+ district name."""
7883 import gzip
7984 import json
8085 import os
@@ -92,27 +97,28 @@ def election_geojson():
9297
def carshare():
    """
    Load the car-sharing dataset, where each row describes the availability of
    car-sharing services near the centroid of a zone in Montreal over a
    month-long period.

    Returns:
        A `pandas.DataFrame` with 249 rows and the following columns:
        `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.
    """
    carshare_df = _get_dataset("carshare")
    return carshare_df
103107
104108
105- def stocks (indexed = False ):
109+ def stocks (indexed = False , datetimes = False ):
106110 """
107- Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
108-
109- Returns:
110- A `pandas.DataFrame` with 100 rows and the following columns:
111- `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
112- If `indexed` is True, the 'date' column is used as the index and the column index
113- is named 'company'
114- """
111+ Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
112+
113+ Returns:
114+ A `pandas.DataFrame` with 100 rows and the following columns:
115+ `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
116+ If `indexed` is True, the 'date' column is used as the index and the column index
117+ is named 'company'.
118+ If `datetimes` is True, the 'date' column will be a datetime column."""
115119 df = _get_dataset ("stocks" )
120+ if datetimes :
121+ df ["date" ] = df ["date" ].astype ("datetime64[ns]" )
116122 if indexed :
117123 df = df .set_index ("date" )
118124 df .columns .name = "company"
@@ -121,15 +127,14 @@ def stocks(indexed=False):
121127
122128def experiment (indexed = False ):
123129 """
124- Each row in this wide dataset represents the results of 100 simulated participants
125- on three hypothetical experiments, along with their gender and control/treatment group.
130+ Each row in this wide dataset represents the results of 100 simulated participants
131+ on three hypothetical experiments, along with their gender and control/treatment group.
126132
127133
128- Returns:
129- A `pandas.DataFrame` with 100 rows and the following columns:
130- `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
131- If `indexed` is True, the data frame index is named "participant"
132- """
134+ Returns:
135+ A `pandas.DataFrame` with 100 rows and the following columns:
136+ `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
137+ If `indexed` is True, the data frame index is named "participant" """
133138 df = _get_dataset ("experiment" )
134139 if indexed :
135140 df .index .name = "participant"
@@ -138,15 +143,14 @@ def experiment(indexed=False):
138143
139144def medals_wide (indexed = False ):
140145 """
141- This dataset represents the medal table for Olympic Short Track Speed Skating for the
142- top three nations as of 2020.
143-
144- Returns:
145- A `pandas.DataFrame` with 3 rows and the following columns:
146- `['nation', 'gold', 'silver', 'bronze']`.
147- If `indexed` is True, the 'nation' column is used as the index and the column index
148- is named 'medal'
149- """
146+ This dataset represents the medal table for Olympic Short Track Speed Skating for the
147+ top three nations as of 2020.
148+
149+ Returns:
150+ A `pandas.DataFrame` with 3 rows and the following columns:
151+ `['nation', 'gold', 'silver', 'bronze']`.
152+ If `indexed` is True, the 'nation' column is used as the index and the column index
153+ is named 'medal'"""
150154 df = _get_dataset ("medals" )
151155 if indexed :
152156 df = df .set_index ("nation" )
@@ -156,14 +160,13 @@ def medals_wide(indexed=False):
156160
157161def medals_long (indexed = False ):
158162 """
159- This dataset represents the medal table for Olympic Short Track Speed Skating for the
160- top three nations as of 2020.
163+ This dataset represents the medal table for Olympic Short Track Speed Skating for the
164+ top three nations as of 2020.
161165
162- Returns:
163- A `pandas.DataFrame` with 9 rows and the following columns:
164- `['nation', 'medal', 'count']`.
165- If `indexed` is True, the 'nation' column is used as the index.
166- """
166+ Returns:
167+ A `pandas.DataFrame` with 9 rows and the following columns:
168+ `['nation', 'medal', 'count']`.
169+ If `indexed` is True, the 'nation' column is used as the index."""
167170 df = _get_dataset ("medals" ).melt (
168171 id_vars = ["nation" ], value_name = "count" , var_name = "medal"
169172 )
0 commit comments