# # Create data folder
# os.makedirs("data/original", exist_ok=True)
# zip_path = os.path.join("data/original", "njmin.zip")
# codebook_path = "data/original/codebook"
# data_path = "data/original/public.dat"
# # Download the zip archive from the source and unzip it
# url = "http://davidcard.berkeley.edu/data_sets/njmin.zip"
# r = requests.get(url)
# with open(zip_path, "wb") as f:
# f.write(r.content)
# with zipfile.ZipFile(zip_path, "r") as zf:
# zf.extractall("data/original")
# z = zipfile.ZipFile(io.BytesIO(r.content))
# # Read the codebook (legacy latin1 encoding)
# with open(codebook_path, "r", encoding="latin1") as f:
# codebook_text = f.read()
# pattern = re.compile(r"^([A-Z0-9_]+)\s+(\d+)\s+(\d+)\s+\d+\.\d+", re.MULTILINE)
# colspecs = []
# names = []
# for match in pattern.finditer(codebook_text):
# varname = match.group(1).lower()
# start = int(match.group(2)) - 1 # convert to 0-based index for pandas
# end = int(match.group(3))
# names.append(varname)
# colspecs.append((start, end))
# # Read the data file
# df = pd.read_fwf(data_path, colspecs=colspecs, names=names)
# df = df.apply(pd.to_numeric, errors="coerce")
# df.to_csv("data/card-and-kruegar-1993.csv", index=False)
# df.sample(5)
# # Create data folder
# os.makedirs("data/original", exist_ok=True)
# zip_path = os.path.join("data/original", "njmin.zip")
# codebook_path = "data/original/codebook"
# data_path = "data/original/public.dat"
# # Download the zip archive from the source and unzip it
# url = "http://davidcard.berkeley.edu/data_sets/njmin.zip"
# r = requests.get(url)
# with open(zip_path, "wb") as f:
# f.write(r.content)
# with zipfile.ZipFile(zip_path, "r") as zf:
# zf.extractall("data/original")
# z = zipfile.ZipFile(io.BytesIO(r.content))
# # Read the codebook (legacy latin1 encoding)
# with open(codebook_path, "r", encoding="latin1") as f:
# codebook_text = f.read()
# pattern = re.compile(r"^([A-Z0-9_]+)\s+(\d+)\s+(\d+)\s+\d+\.\d+", re.MULTILINE)
# colspecs = []
# names = []
# for match in pattern.finditer(codebook_text):
# varname = match.group(1).lower()
# start = int(match.group(2)) - 1 # convert to 0-based index for pandas
# end = int(match.group(3))
# names.append(varname)
# colspecs.append((start, end))
# # Read the data file
# df = pd.read_fwf(data_path, colspecs=colspecs, names=names)
# df = df.apply(pd.to_numeric, errors="coerce")
# df.to_csv("data/card-and-kruegar-1993.csv", index=False)
# df.sample(5)