import pandas as pd
# Direct-download link (dl=1) to the flights on-time CSV hosted on Dropbox.
url = (
    "https://www.dropbox.com/scl/fi/"
    "nnlww9mk1vmevn1lytywr/flights_ontime.csv"
    "?rlkey=iska1a863ezg640lvd86wgoky&dl=1"
)
# 1) read FL_DATE and pull times in as strings
# Columns that are read as strings and later parsed as "hhmm" clock times.
time_cols = ["ARR_TIME", "DEP_TIME", "CRS_DEP_TIME", "CRS_ARR_TIME"]
# Load the flights CSV: FL_DATE is parsed as a datetime, the clock-time
# columns are kept as strings, and rows missing any of those times are dropped.
df = (
    pd.read_csv(
        url,
        parse_dates=["FL_DATE"],
        # `date_format` expects a strftime pattern (or a dict of patterns),
        # not a callable, so the format string is passed directly.
        date_format="%m/%d/%Y %I:%M:%S %p",
        dtype={col: str for col in time_cols},  # force them to string
    )
    .dropna(subset=time_cols)  # drop rows missing any of the times
)
# 2) define a one-liner to zero-pad & parse “hhmm”
def hhmm_to_time(col):
    """Convert a Series of "hhmm" strings into ``datetime.time`` values.

    Each string is left-padded with zeros to four characters and parsed
    with the ``%H%M`` format. Anything that does not parse (for example
    non-numeric text, or an hour outside 00-23 such as "2400") is coerced
    to ``NaT`` instead of raising.

    Args:
        col: pandas Series of strings holding clock times.

    Returns:
        An object-dtype Series holding ``datetime.time`` values, with
        ``NaT`` where parsing failed.
    """
    parsed = pd.to_datetime(
        col.str.zfill(4),  # "1" -> "0001", "59" -> "0059", "1323" -> "1323"
        format="%H%M",
        errors="coerce",  # invalid -> NaT
    )
    # Keep only the time-of-day part; the date produced by parsing is dropped.
    return parsed.dt.time
# 3) apply it to all of them in one go
# Replace every time column with its parsed version in a single assignment.
df[time_cols] = df[time_cols].apply(hhmm_to_time)
Assignment 04: Hypothesis Testing and Bootstrapping
Hypothesis testing with flights data
This is the least structured problem you have received to date. The goal is for you to explore the given dataset and apply hypothesis testing techniques to turn vague questions into concrete statistical tests.
Below is code to import a dataset of flights across the US in January 2025.
Think about how you might use this data to design hypothesis tests that answer the following questions:
1. Does the time of day affect flight delays?
2. Do flights from different airlines have different delay patterns? (Hint: is there a difference in how long flights are delayed, or in how often they are delayed?)