Vega datasets

from vega_datasets import data
import pandas as pd
from itables import init_notebook_mode

# Switch the notebook into itables' interactive rendering mode before any
# DataFrame is displayed.
init_notebook_mode(all_interactive=True)

# Print the name of every dataset the vega_datasets loader knows about,
# one name per line.
available_datasets = data.list_datasets()
for dataset in available_datasets:
    print(dataset)
7zip
airports
annual-precip
anscombe
barley
birdstrikes
budget
budgets
burtin
cars
climate
co2-concentration
countries
crimea
disasters
driving
earthquakes
ffox
flare
flare-dependencies
flights-10k
flights-200k
flights-20k
flights-2k
flights-3m
flights-5k
flights-airport
gapminder
gapminder-health-income
gimp
github
graticule
income
iowa-electricity
iris
jobs
la-riots
londonBoroughs
londonCentroids
londonTubeLines
lookup_groups
lookup_people
miserables
monarchs
movies
normal-2d
obesity
ohlc
points
population
population_engineers_hurricanes
seattle-temps
seattle-weather
sf-temps
sp500
stocks
udistrict
unemployment
unemployment-across-industries
uniform-2d
us-10m
us-employment
us-state-capitals
volcano
weather
weball26
wheat
windvectors
world-110m
zipcodes
import pandas as pd
from vega_datasets import data
from itables import show
import textwrap

# Short human-readable description for each dataset, keyed by dataset name.
# Looked up by get_dataset_info(); names missing from this table fall back to
# the "No description available." default there.
dataset_descriptions = {
    "7zip": "Application icon from the open-source 7-Zip software project.",
    "airports": "Airport data, source not specified in the original description.",
    "annual-precip": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell, from CFSv2.",
    "anscombe": "Dataset from 'Graphs in Statistical Analysis' by F. J. Anscombe, The American Statistician.",
    "barley": "Results from a 1930s agricultural experiment in Minnesota, containing yields for 10 different varieties of barley at six different sites.",
    "birdstrikes": "Bird strike data from http://wildlife.faa.gov",
    "budget": "U.S. Budget data for FY 2016 from the Office of Management and Budget.",
    "burtin": "Dataset based on Will Burtin's 1951 visualization of antibiotic effectiveness, comparing three antibiotics against 16 bacteria.",
    "cars": "Car statistics dataset from http://lib.stat.cmu.edu/datasets/",
    "countries": "Demographic indicators (life expectancy and fertility rate) for various countries from 1955 to 2000 at 5-year intervals.",
    "crimea": "Data related to Florence Nightingale's famous 'Coxcomb' chart about mortality in the Crimean War.",
    "disasters": "Natural disaster data from https://ourworldindata.org/natural-catastrophes",
    "driving": "Driving statistics, possibly related to a New York Times article from May 2, 2010.",
    "earthquakes": "Earthquake data from USGS, captured on Feb 6, 2018.",
    "ffox": "Application icon from the open-source Firefox software project.",
    "flare": "Hierarchical data, possibly related to software architecture or dependencies.",
    "flights-10k": "Flight delay statistics from U.S. Bureau of Transportation Statistics, subset of 10,000 records.",
    "gapminder": "Demographic indicators (life expectancy, population, and fertility rate) for various countries from 1955 to 2005 at 5-year intervals.",
    "gimp": "Application icon from the open-source GIMP software project.",
    "github": "GitHub-related data, generated using a custom script.",
    "iowa-electricity": "Annual net generation of electricity in Iowa by source, compiled by the U.S. Energy Information Administration.",
    "jobs": "U.S. census data on occupations by sex and year across decades between 1850 and 2000.",
    "miserables": "Co-occurrence of characters in Victor Hugo's Les Misérables.",
    "monarchs": "Chronological list of English and British monarchs from Elizabeth I through George IV.",
    "movies": "Movie dataset with intentionally included errors for instructional purposes.",
    "population": "United States population statistics by sex and age group across decades between 1850 and 2000.",
    "seattle-weather": "Daily weather records for Seattle with metric units, synthesized from NOAA data.",
    "sp500": "S&P 500 index values from 2000 to 2020, retrieved from Yahoo Finance.",
    "unemployment": "County-level unemployment rates in the United States, generally consistent with 2009 levels.",
    "us-10m": "U.S. map data at 1:10 million scale.",
    "volcano": "Topographic information for Maunga Whau (Mt Eden) volcano in Auckland, New Zealand.",
    "weather": "Instructional dataset showing actual and predicted temperature data.",
    "wheat": "250 years of wheat prices alongside weekly wages and reigning monarchs, based on William Playfair's 1822 chart.",
    "zipcodes": "Zipcode data from GeoNames.org",
}


def get_dataset_info(dataset_name, *, max_rows=100):
    """Load a dataset by name and summarise a preview of it.

    Args:
        dataset_name: Name handed to the ``vega_datasets`` loader ``data``.
        max_rows: Maximum number of leading rows kept in the preview.
            Defaults to 100, the limit that used to be hard-coded.

    Returns:
        A dict with the keys ``name``, ``df``, ``rows``, ``columns``,
        ``size_kb`` and ``description``, or ``None`` when loading or
        summarising the dataset raised any exception.

        ``df`` is the preview (at most ``max_rows`` rows). ``rows`` and
        ``size_kb`` are measured on that preview, not on the full dataset,
        so ``rows`` never exceeds ``max_rows``.
    """
    try:
        df = data(dataset_name)
        # Keep only the leading rows so large datasets stay cheap to display.
        if len(df) > max_rows:
            df = df.head(max_rows)
        return {
            "name": dataset_name,
            "df": df,
            "rows": len(df),
            "columns": len(df.columns),
            # deep=True also counts the contents of object columns.
            "size_kb": df.memory_usage(deep=True).sum() / 1024,
            "description": dataset_descriptions.get(
                dataset_name, "No description available."
            ),
        }
    except Exception:
        # Deliberate best-effort: any dataset that fails to load, or whose
        # loaded value lacks the DataFrame methods used above, is reported
        # as None. Presumably this is how non-tabular entries (icons, map
        # data) get dropped -- TODO confirm against the loader.
        return None


# Names of the datasets to load and preview, in display order.
# This list is hard-coded rather than taken from data.list_datasets().
# Compared with the listing printed earlier in this file, it leaves out
# "flights-200k" and "flights-3m" -- presumably because of their size
# (TODO confirm).
dataset_names = [
    "7zip",
    "airports",
    "annual-precip",
    "anscombe",
    "barley",
    "birdstrikes",
    "budget",
    "budgets",
    "burtin",
    "cars",
    "climate",
    "co2-concentration",
    "countries",
    "crimea",
    "disasters",
    "driving",
    "earthquakes",
    "ffox",
    "flare",
    "flare-dependencies",
    "flights-10k",
    "flights-20k",
    "flights-2k",
    "flights-5k",
    "flights-airport",
    "gapminder",
    "gapminder-health-income",
    "gimp",
    "github",
    "graticule",
    "income",
    "iowa-electricity",
    "iris",
    "jobs",
    "la-riots",
    "londonBoroughs",
    "londonCentroids",
    "londonTubeLines",
    "lookup_groups",
    "lookup_people",
    "miserables",
    "monarchs",
    "movies",
    "normal-2d",
    "obesity",
    "ohlc",
    "points",
    "population",
    "population_engineers_hurricanes",
    "seattle-temps",
    "seattle-weather",
    "sf-temps",
    "sp500",
    "stocks",
    "udistrict",
    "unemployment",
    "unemployment-across-industries",
    "uniform-2d",
    "us-10m",
    "us-employment",
    "us-state-capitals",
    "volcano",
    "weather",
    "weball26",
    "wheat",
    "windvectors",
    "world-110m",
    "zipcodes",
]

# Summarise every requested dataset in a single pass, keeping only those
# that loaded (not None) and whose preview reports a positive size.
dataset_info = [
    info
    for info in map(get_dataset_info, dataset_names)
    if info is not None and info["size_kb"] > 0
]

# Emit one markdown-style section per dataset: heading, preview statistics,
# the description wrapped to 80 columns, then the interactive table.
for info in dataset_info:
    heading = f"\n\n## {info['name']}"
    stats = f"Rows: {info['rows']}, Columns: {info['columns']}, Size: {info['size_kb']:.2f} KB"
    wrapped_description = textwrap.fill(info["description"], width=80)
    # One print with a newline separator writes the same characters as
    # printing the four pieces one after another.
    print(heading, stats, "\n**Description:**", wrapped_description, sep="\n")
    show(info["df"], caption=f"Top 100 rows of {info['name']} dataset")
    print("\n")


## airports
Rows: 100, Columns: 7, Size: 28.76 KB

**Description:**
Airport data, source not specified in the original description.
Top 100 rows of airports dataset
iata name city state country latitude longitude
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## anscombe
Rows: 44, Columns: 3, Size: 3.01 KB

**Description:**
Dataset from 'Graphs in Statistical Analysis' by F. J. Anscombe, The American
Statistician.
Top 100 rows of anscombe dataset
Series X Y
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## barley
Rows: 100, Columns: 4, Size: 12.88 KB

**Description:**
Results from a 1930s agricultural experiment in Minnesota, containing yields for
10 different varieties of barley at six different sites.
Top 100 rows of barley dataset
yield variety year site
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## birdstrikes
Rows: 100, Columns: 14, Size: 61.23 KB

**Description:**
Bird strike data from http://wildlife.faa.gov
Top 100 rows of birdstrikes dataset
Airport__Name Aircraft__Make_Model Effect__Amount_of_damage Flight_Date Aircraft__Airline_Operator Origin_State When__Phase_of_flight Wildlife__Size Wildlife__Species When__Time_of_day Cost__Other Cost__Repair Cost__Total_$ Speed_IAS_in_knots
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## budget
Rows: 100, Columns: 72, Size: 359.72 KB

**Description:**
U.S. Budget data for FY 2016 from the Office of Management and Budget.
Top 100 rows of budget dataset
Source Category Code Source category name Source subcategory Source subcategory name Agency code Agency name Bureau code Bureau name Account code Account name Treasury Agency code On- or off-budget 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 TQ 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## budgets
Rows: 100, Columns: 3, Size: 2.47 KB

**Description:**
No description available.
Top 100 rows of budgets dataset
budgetYear forecastYear value
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## burtin
Rows: 16, Columns: 6, Size: 3.39 KB

**Description:**
Dataset based on Will Burtin's 1951 visualization of antibiotic effectiveness,
comparing three antibiotics against 16 bacteria.
Top 100 rows of burtin dataset
Bacteria Penicillin Streptomycin Neomycin Gram_Staining Genus
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## cars
Rows: 100, Columns: 9, Size: 17.21 KB

**Description:**
Car statistics dataset from http://lib.stat.cmu.edu/datasets/
Top 100 rows of cars dataset
Name Miles_per_Gallon Cylinders Displacement Horsepower Weight_in_lbs Acceleration Year Origin
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## climate
Rows: 100, Columns: 9, Size: 19.95 KB

**Description:**
No description available.
Top 100 rows of climate dataset
STATION STATION_NAME ELEVATION LATITUDE LONGITUDE DATE HLY-TEMP-NORMAL HLY-PRES-NORMAL HLY-DEWP-NORMAL
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## co2-concentration
Rows: 100, Columns: 2, Size: 6.67 KB

**Description:**
No description available.
Top 100 rows of co2-concentration dataset
Date CO2
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## countries
Rows: 100, Columns: 9, Size: 14.33 KB

**Description:**
Demographic indicators (life expectancy and fertility rate) for various
countries from 1955 to 2000 at 5-year intervals.
Top 100 rows of countries dataset
_comment year fertility life_expect n_fertility n_life_expect country p_fertility p_life_expect
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## crimea
Rows: 24, Columns: 4, Size: 0.88 KB

**Description:**
Data related to Florence Nightingale's famous 'Coxcomb' chart about mortality in
the Crimean War.
Top 100 rows of crimea dataset
date wounds other disease
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## disasters
Rows: 100, Columns: 3, Size: 8.53 KB

**Description:**
Natural disaster data from https://ourworldindata.org/natural-catastrophes
Top 100 rows of disasters dataset
Entity Year Deaths
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## driving
Rows: 55, Columns: 4, Size: 4.31 KB

**Description:**
Driving statistics, possibly related to a New York Times article from May 2,
2010.
Top 100 rows of driving dataset
side year miles gas
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flare
Rows: 100, Columns: 4, Size: 8.21 KB

**Description:**
Hierarchical data, possibly related to software architecture or dependencies.
Top 100 rows of flare dataset
id name parent size
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flare-dependencies
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
No description available.
Top 100 rows of flare-dependencies dataset
source target
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flights-10k
Rows: 100, Columns: 5, Size: 12.63 KB

**Description:**
Flight delay statistics from U.S. Bureau of Transportation Statistics, subset of
10,000 records.
Top 100 rows of flights-10k dataset
date delay distance origin destination
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flights-20k
Rows: 100, Columns: 5, Size: 12.63 KB

**Description:**
No description available.
Top 100 rows of flights-20k dataset
date delay distance origin destination
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flights-2k
Rows: 100, Columns: 5, Size: 12.63 KB

**Description:**
No description available.
Top 100 rows of flights-2k dataset
date delay distance origin destination
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flights-5k
Rows: 100, Columns: 5, Size: 12.63 KB

**Description:**
No description available.
Top 100 rows of flights-5k dataset
date delay distance origin destination
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## flights-airport
Rows: 100, Columns: 3, Size: 11.07 KB

**Description:**
No description available.
Top 100 rows of flights-airport dataset
origin destination count
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## gapminder
Rows: 100, Columns: 6, Size: 9.61 KB

**Description:**
Demographic indicators (life expectancy, population, and fertility rate) for
various countries from 1955 to 2005 at 5-year intervals.
Top 100 rows of gapminder dataset
year country cluster pop life_expect fertility
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## gapminder-health-income
Rows: 100, Columns: 4, Size: 8.05 KB

**Description:**
No description available.
Top 100 rows of gapminder-health-income dataset
country income health population
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## github
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
GitHub-related data, generated using a custom script.
Top 100 rows of github dataset
time count
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## income
Rows: 100, Columns: 6, Size: 19.47 KB

**Description:**
No description available.
Top 100 rows of income dataset
name region id pct total group
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## iowa-electricity
Rows: 51, Columns: 3, Size: 3.96 KB

**Description:**
Annual net generation of electricity in Iowa by source, compiled by the U.S.
Energy Information Administration.
Top 100 rows of iowa-electricity dataset
year source net_generation
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## iris
Rows: 100, Columns: 5, Size: 8.82 KB

**Description:**
No description available.
Top 100 rows of iris dataset
sepalLength sepalWidth petalLength petalWidth species
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## jobs
Rows: 100, Columns: 5, Size: 13.70 KB

**Description:**
U.S. census data on occupations by sex and year across decades between 1850 and
2000.
Top 100 rows of jobs dataset
job sex year count perc
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## la-riots
Rows: 63, Columns: 11, Size: 27.62 KB

**Description:**
No description available.
Top 100 rows of la-riots dataset
first_name last_name age gender race death_date address neighborhood type longitude latitude
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## londonCentroids
Rows: 33, Columns: 3, Size: 2.55 KB

**Description:**
No description available.
Top 100 rows of londonCentroids dataset
name cx cy
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## lookup_groups
Rows: 9, Columns: 2, Size: 0.67 KB

**Description:**
No description available.
Top 100 rows of lookup_groups dataset
group person
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## lookup_people
Rows: 9, Columns: 3, Size: 0.74 KB

**Description:**
No description available.
Top 100 rows of lookup_people dataset
name age height
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## monarchs
Rows: 12, Columns: 5, Size: 1.17 KB

**Description:**
Chronological list of English and British monarchs from Elizabeth I through
George IV.
Top 100 rows of monarchs dataset
name start end index commonwealth
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## movies
Rows: 100, Columns: 16, Size: 46.73 KB

**Description:**
Movie dataset with intentionally included errors for instructional purposes.
Top 100 rows of movies dataset
Title US_Gross Worldwide_Gross US_DVD_Sales Production_Budget Release_Date MPAA_Rating Running_Time_min Distributor Source Major_Genre Creative_Type Director Rotten_Tomatoes_Rating IMDB_Rating IMDB_Votes
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## normal-2d
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
No description available.
Top 100 rows of normal-2d dataset
u v
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## obesity
Rows: 50, Columns: 3, Size: 3.40 KB

**Description:**
No description available.
Top 100 rows of obesity dataset
id rate state
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## ohlc
Rows: 44, Columns: 7, Size: 4.50 KB

**Description:**
No description available.
Top 100 rows of ohlc dataset
date open high low close signal ret
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## points
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
No description available.
Top 100 rows of points dataset
x y
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## population
Rows: 100, Columns: 4, Size: 3.25 KB

**Description:**
United States population statistics by sex and age group across decades between
1850 and 2000.
Top 100 rows of population dataset
year age sex people
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## population_engineers_hurricanes
Rows: 52, Columns: 5, Size: 4.68 KB

**Description:**
No description available.
Top 100 rows of population_engineers_hurricanes dataset
state id population engineers hurricanes
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## seattle-temps
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
No description available.
Top 100 rows of seattle-temps dataset
date temp
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## seattle-weather
Rows: 100, Columns: 6, Size: 9.20 KB

**Description:**
Daily weather records for Seattle with metric units, synthesized from NOAA data.
Top 100 rows of seattle-weather dataset
date precipitation temp_max temp_min wind weather
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## sf-temps
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
No description available.
Top 100 rows of sf-temps dataset
temp date
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## sp500
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
S&P 500 index values from 2000 to 2020, retrieved from Yahoo Finance.
Top 100 rows of sp500 dataset
date price
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## stocks
Rows: 100, Columns: 3, Size: 6.87 KB

**Description:**
No description available.
Top 100 rows of stocks dataset
symbol date price
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## udistrict
Rows: 100, Columns: 2, Size: 6.39 KB

**Description:**
No description available.
Top 100 rows of udistrict dataset
key lat
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## unemployment
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
County-level unemployment rates in the United States, generally consistent with
2009 levels.
Top 100 rows of unemployment dataset
id rate
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## unemployment-across-industries
Rows: 100, Columns: 6, Size: 9.80 KB

**Description:**
No description available.
Top 100 rows of unemployment-across-industries dataset
series year month count rate date
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## uniform-2d
Rows: 100, Columns: 2, Size: 1.69 KB

**Description:**
No description available.
Top 100 rows of uniform-2d dataset
u v
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## us-employment
Rows: 100, Columns: 24, Size: 23.86 KB

**Description:**
No description available.
Top 100 rows of us-employment dataset
month nonfarm private goods_producing service_providing private_service_providing mining_and_logging construction manufacturing durable_goods nondurable_goods trade_transportation_utilties wholesale_trade retail_trade transportation_and_warehousing utilities information financial_activities professional_and_business_services education_and_health_services leisure_and_hospitality other_services government nonfarm_change
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## us-state-capitals
Rows: 50, Columns: 4, Size: 6.52 KB

**Description:**
No description available.
Top 100 rows of us-state-capitals dataset
lon lat state city
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## volcano
Rows: 100, Columns: 3, Size: 2.47 KB

**Description:**
Topographic information for Maunga Whau (Mt Eden) volcano in Auckland, New
Zealand.
Top 100 rows of volcano dataset
width height values
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## weather
Rows: 10, Columns: 6, Size: 6.63 KB

**Description:**
Instructional dataset showing actual and predicted temperature data.
Top 100 rows of weather dataset
day record normal actual id forecast
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## weball26
Rows: 58, Columns: 25, Size: 27.84 KB

**Description:**
No description available.
Top 100 rows of weball26 dataset
Candidate_Identification Candidate_Name Incumbent_Challenger_Status Party_Code Party_Affiliation Total_Receipts Transfers_from_Authorized_Committees Total_Disbursements Transfers_to_Authorized_Committees Beginning_Cash Ending_Cash Contributions_from_Candidate Loans_from_Candidate Other_Loans Candidate_Loan_Repayments Other_Loan_Repayments Debts_Owed_By Total_Individual_Contributions Candidate_State Candidate_District Contributions_from_Other_Political_Committees Contributions_from_Party_Committees Coverage_End_Date Refunds_to_Individuals Refunds_to_Committees
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## wheat
Rows: 52, Columns: 3, Size: 1.35 KB

**Description:**
250 years of wheat prices alongside weekly wages and reigning monarchs, based on
William Playfair's 1822 chart.
Top 100 rows of wheat dataset
year wheat wages
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## windvectors
Rows: 100, Columns: 5, Size: 4.04 KB

**Description:**
No description available.
Top 100 rows of windvectors dataset
longitude latitude dir dirCat speed
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)




## zipcodes
Rows: 100, Columns: 6, Size: 22.99 KB

**Description:**
Zipcode data from GeoNames.org
Top 100 rows of zipcodes dataset
zip_code latitude longitude city state county
Loading ITables v2.2.2 from the init_notebook_mode cell... (need help?)