Source code for ogusa.psid_data_setup

import numpy as np
import pandas as pd
import os
import pickle
from pandas_datareader import data as web
import datetime
from linearmodels import PanelOLS
from ogusa.constants import PSID_NOMINAL_VARS, PSID_CONSTANT_VARS


try:
    # This is the case when a separate script is calling this function in
    # this module
    CURDIR = os.path.split(os.path.abspath(__file__))[0]
except NameError:
    # This is the case when a Jupyter notebook is calling this function
    CURDIR = os.getcwd()
output_fldr = "io_files"
output_dir = os.path.join(CURDIR, output_fldr)
if not os.access(output_dir, os.F_OK):
    os.makedirs(output_dir)


def prep_data(
    data=os.path.join(CURDIR, "..", "data", "PSID", "psid1968to2015.csv.gz")
):
    """
    This script takes PSID data created from psid_download.R and:
    1) Creates variables at the "tax filing unit" level (equal to the
       family unit in the PSID since there is no information on the
       filing status chosen).
    2) Selects a sample of observations to work with (e.g., dropping the
       very old, those with very low income, etc.).
    3) Computes a measure of lifetime income and places each household
       into a lifetime income percentile group.

    Args:
        data (str): path to CSV file with PSID data (created from
            psid_download.R)

    Returns:
        panel_li (Pandas DataFrame): household level data with lifetime
            income groups defined

    """
    # Read the data into a pandas DataFrame
    raw_df = pd.read_csv(data, compression="gzip")
    # Create a unique identifier for each household.  Note that a new
    # household is defined if the head or spouse changes.
    # Keep only current heads.  Before 1983, the head is identified by
    # relation.head == 1; from 1983 on, by relation.head == 10.
    # Select just those in the SRC sample, which is representative of the
    # population and so does not require the use of sampling weights.
    # SRC sample families have 1968 family interview numbers below 3000.
    raw_df = raw_df[raw_df["ID1968"] < 3000].copy()
    raw_df.loc[
        raw_df.index[(raw_df["year"] < 1983) & (raw_df["relation.head"] == 1)],
        "relation.head",
    ] = 10
    raw_df.loc[
        raw_df.index[(raw_df["year"] < 1983) & (raw_df["relation.head"] == 2)],
        "relation.head",
    ] = 20
    head_df = raw_df.loc[
        raw_df.index[
            (raw_df["relation.head"] == 10) & (raw_df["sequence"] == 1)
        ],
        :,
    ]
    head_df.rename(columns={"pid": "head_id"}, inplace=True)
    # Keep legal spouses or long-term partners
    spouse_df = raw_df.loc[
        raw_df.index[
            (raw_df["relation.head"] >= 20)
            & (raw_df["relation.head"] <= 22)
            & (raw_df["sequence"] == 2)
        ],
        ["pid", "ID1968", "year", "interview_number"],
    ]
    spouse_df.rename(columns={"pid": "spouse_id"}, inplace=True)
    psid_df = head_df.merge(
        spouse_df, how="left", on=["ID1968", "year", "interview_number"]
    )
    # Create a unique household id for each combination of a head and a
    # specific spouse
    psid_df["hh_id"] = (psid_df["head_id"] * 1000000) + psid_df[
        "spouse_id"
    ].fillna(0)
    # Clean up DataFrames no longer needed
    del raw_df, head_df, spouse_df
    # Fix ages to increment by one (or two) between survey waves.  They do
    # not always do this because the survey may be fielded at different
    # times of the year.
    min_age_df = psid_df.groupby("hh_id").agg(["min"])["head_age"]
    min_age_df.rename(columns={"min": "min_age"}, inplace=True)
    min_year_df = psid_df.groupby("hh_id").agg(["min"])["year"]
    min_year_df.rename(columns={"min": "min_year"}, inplace=True)
    psid_df = psid_df.merge(min_age_df, on="hh_id", how="left")
    psid_df = psid_df.merge(min_year_df, on="hh_id", how="left")
    psid_df.sort_values(by=["hh_id", "year"], inplace=True)
    psid_df["age"] = psid_df["year"] - psid_df["min_year"] + psid_df["min_age"]
    # Clean up
    del min_age_df, min_year_df
    # Deflate nominal variables.  The data year is year - 1 because the
    # surveys ask about the prior year.
    psid_df["year_data"] = psid_df["year"] - 1
    # Create spouse labor income, since the variable name is not
    # consistent across time
    psid_df["spouse_labor_inc"] = (
        psid_df["spouse_labor_inc_pre1993"]
        + psid_df["spouse_labor_inc_post1993"]
    )
    psid_df.loc[psid_df["year"] == 1993, "spouse_labor_inc"] = psid_df[
        "spouse_labor_inc_post1993"
    ]
    # Set beginning and end dates for the price data
    start = datetime.datetime(1968, 1, 1)
    end = datetime.datetime.today()
    # Pull the CPI series from FRED using pandas_datareader
    fred_data = web.DataReader(["CPIAUCSL"], "fred", start, end)
    # Make the data annual by averaging over the months in each year
    fred_data = fred_data.resample("YE").mean()
    fred_data["year_data"] = fred_data.index.year
    psid_df2 = psid_df.merge(fred_data, how="left", on="year_data")
    psid_df = psid_df2
    cpi_2010 = fred_data.loc[datetime.datetime(2010, 12, 31), "CPIAUCSL"]
    for item in PSID_NOMINAL_VARS:
        psid_df[item] = (psid_df[item] * cpi_2010) / psid_df["CPIAUCSL"]
    # Clean up
    del fred_data, psid_df2
    # Fill in missing values with zeros
    psid_df[PSID_NOMINAL_VARS] = psid_df[PSID_NOMINAL_VARS].fillna(0)
    psid_df[["head_annual_hours", "spouse_annual_hours"]] = psid_df[
        ["head_annual_hours", "spouse_annual_hours"]
    ].fillna(0)
    # Construct family ("filing unit") level variables
    psid_df["incwage_hh"] = (
        psid_df["head_labor_inc"] + psid_df["spouse_labor_inc"]
    )
    psid_df["earninc_hh"] = (
        psid_df["incwage_hh"]
        + psid_df["head_noncorp_bus_labor_income"]
        + psid_df["spouse_noncorp_bus_labor_income"]
    )
    psid_df["businc_hh"] = (
        psid_df["head_noncorp_bus_labor_income"]
        + psid_df["spouse_noncorp_bus_labor_income"]
    )
    # Note that the PSID does not separate hours between employment and
    # business work
    psid_df["earnhours_hh"] = (
        psid_df["head_annual_hours"] + psid_df["spouse_annual_hours"]
    )
    psid_df["wage_rate"] = psid_df["incwage_hh"] / psid_df["earnhours_hh"]
    psid_df["earn_rate"] = psid_df["earninc_hh"] / psid_df["earnhours_hh"]
    with np.errstate(divide="ignore"):
        psid_df["ln_wage_rate"] = np.log(psid_df["wage_rate"])
        psid_df["ln_earn_rate"] = np.log(psid_df["earn_rate"])
    psid_df["singlemale"] = (psid_df["head_gender"] == 1) & (
        psid_df["marital_status"] != 1
    )
    psid_df["singlefemale"] = (psid_df["head_gender"] == 2) & (
        psid_df["marital_status"] != 1
    )
    psid_df["marriedmalehead"] = (psid_df["head_gender"] == 1) & (
        psid_df["marital_status"] == 1
    )
    psid_df["marriedfemalehead"] = (psid_df["head_gender"] == 2) & (
        psid_df["marital_status"] == 1
    )
    psid_df["married"] = (psid_df["marital_status"] == 1).astype(int)
    # Sample selection: drop the very young, the very old, those with very
    # low earnings, any outliers with very high wage rates, and those
    # working 200 hours or fewer per year.
    # Should check whether to drop any particular years
    # (e.g., I think some data are missing before 1970).
    psid_df.query(
        "age >= 20 & age <= 80 & incwage_hh >= 5"
        + " & wage_rate >= 5 & wage_rate <= 25000"
        + " & earnhours_hh > 200",
        inplace=True,
    )
    # Indicator for an observation coming from the PSID rather than being
    # an interpolated value; used to make drops later
    psid_df.sort_values(by=["hh_id", "year"], inplace=True)
    # psid_df[
    #     [
    #         "head_id",
    #         "spouse_id",
    #         "hh_id",
    #         "head_age",
    #         "age",
    #         "spouse_age",
    #         "ID1968",
    #         "year",
    #         "interview_number",
    #         "head_marital_status",
    #         "marital_status",
    #     ]
    # ].to_csv("psid_to_check.csv")
    # The next several lines identify and then drop from the sample hh_ids
    # that report more than one type of marital status.  There are 179 of
    # these; 26 are men who report being married and not married at
    # different times, even when a spouse id is not present.
    marriedmale_df = psid_df.groupby("hh_id").agg(["max"])["marriedmalehead"]
    singlemale_df = psid_df.groupby("hh_id").agg(["max"])["singlemale"]
    marriedfemale_df = psid_df.groupby("hh_id").agg(["max"])[
        "marriedfemalehead"
    ]
    singlefemale_df = psid_df.groupby("hh_id").agg(["max"])["singlefemale"]
    marriedmale_df.rename(columns={"max": "m_marriedmalehead"}, inplace=True)
    singlemale_df.rename(columns={"max": "m_singlemale"}, inplace=True)
    marriedfemale_df.rename(
        columns={"max": "m_marriedfemalehead"}, inplace=True
    )
    singlefemale_df.rename(columns={"max": "m_singlefemale"}, inplace=True)
    merged_df = marriedmale_df.join(
        [singlemale_df, marriedfemale_df, singlefemale_df],
        how="outer",
        sort=True,
    )
    merged_df["sum_status"] = (
        merged_df["m_singlemale"].astype(int)
        + merged_df["m_singlefemale"].astype(int)
        + merged_df["m_marriedfemalehead"].astype(int)
        + merged_df["m_marriedmalehead"].astype(int)
    )
    merged_df_to_list = merged_df[merged_df["sum_status"] > 1]
    merged_df_to_list.to_csv("hh_id_two_statuses.csv")
    hhid_to_drop = merged_df_to_list.copy()
    hhid_to_drop["keep"] = False
    psid_df = psid_df.merge(hhid_to_drop, on="hh_id", how="left")
    psid_df["keep"].fillna(True, inplace=True)
    psid_df = psid_df[psid_df["keep"]].copy()
    psid_df["in_psid"] = True
    # Print the number of observations by year
    print(
        "Number of obs by year = ",
        psid_df["hh_id"].groupby([psid_df.year]).agg("count"),
    )
    num_obs_psid = psid_df.shape[0]
    psid_df.sort_values(by=["hh_id", "year"], inplace=True)
    # Clean up
    del (
        merged_df_to_list,
        hhid_to_drop,
        marriedmale_df,
        marriedfemale_df,
        singlemale_df,
        singlefemale_df,
    )
    # "Fill in" observations so that each household has an observation for
    # every age from 20 to 80.  Note that this is done before running the
    # regression, but that is fine since wages are missing for the
    # filled-in rows, so they do not affect the regression.
    uid = psid_df["hh_id"].unique()
    all_ages = list(range(20, 81))  # list of ages 20 to 80
    ids_full = np.array([[x] * len(all_ages) for x in list(uid)]).flatten()
    ages = all_ages * len(uid)
    balanced_panel = pd.DataFrame({"hh_id": ids_full, "age": ages})
    rebalanced_data = balanced_panel.merge(
        psid_df, how="left", on=["hh_id", "age"]
    )
    # Backfill and then forward fill variables that are constant over time
    # within hh_id
    for item in PSID_CONSTANT_VARS:
        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].bfill()
        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].ffill()
    ### NOTE: we seem to get some cases where the marital status is not
    # constant despite trying to set up the identification of a household
    # such that it has to be.  Why this is happening needs to be checked.
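    # A small worked example of the year inference performed below
    # (illustrative numbers, not from the data): suppose a household is
    # observed in 1990-1992 at ages 40-42 and has been filled in for ages
    # 20-80.  cumcount() runs 0, 1, ..., 60 down the age-sorted rows, so
    # year - counter equals the same constant (1990 - 20 = 1970) in every
    # row where year is observed, and 0 (after the fillna) elsewhere.
    # Taking the max of that difference within hh_id recovers the
    # constant, and max + counter then rebuilds the full year sequence,
    # assigning years to the filled-in ages as well.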
    # Fill in year by doing a cumulative counter within each hh_id and then
    # using the difference between age and this counter to infer what the
    # year should be
    rebalanced_data.sort_values(["hh_id", "age"], inplace=True)
    rebalanced_data["counter"] = rebalanced_data.groupby("hh_id").cumcount()
    rebalanced_data["diff"] = (
        rebalanced_data["year"] - rebalanced_data["counter"]
    )
    rebalanced_data["diff"].fillna(
        0, inplace=True
    )  # because NaNs where year is missing
    max_df = rebalanced_data.groupby("hh_id").agg(["max"])["diff"]
    rebalanced_data = rebalanced_data.join(max_df, how="left", on=["hh_id"])
    rebalanced_data["year"] = (
        rebalanced_data["max"] + rebalanced_data["counter"]
    )
    # Clean up
    del max_df, balanced_panel
    ### Check that there are 61 obs for each hh_id
    # Create additional variables for the first stage regressions
    df = rebalanced_data.reset_index()
    df["age2"] = df["age"] ** 2
    df["age3"] = df["age"] ** 3
    df["age_smale"] = df["age"] * df["singlemale"]
    df["age_sfemale"] = df["age"] * df["singlefemale"]
    df["age_mmale"] = df["age"] * df["marriedmalehead"]
    df["age_mfemale"] = df["age"] * df["marriedfemalehead"]
    df["age_smale2"] = df["age2"] * df["singlemale"]
    df["age_sfemale2"] = df["age2"] * df["singlefemale"]
    df["age_mmale2"] = df["age2"] * df["marriedmalehead"]
    df["age_mfemale2"] = df["age2"] * df["marriedfemalehead"]
    df["age_smale3"] = df["age3"] * df["singlemale"]
    df["age_sfemale3"] = df["age3"] * df["singlefemale"]
    df["age_mmale3"] = df["age3"] * df["marriedmalehead"]
    df["age_mfemale3"] = df["age3"] * df["marriedfemalehead"]
    # Clean up
    del rebalanced_data
    # Run regressions to impute wages for years not observed in the sample
    df.set_index(["hh_id", "year"], inplace=True)
    list_of_statuses = [
        "Single Males",
        "Single Females",
        "Married, Male Head",
        "Married, Female Head",
    ]
    list_of_dfs = [
        df[df["singlemale"]].copy(),
        df[df["singlefemale"]].copy(),
        df[df["marriedmalehead"]].copy(),
        df[df["marriedfemalehead"]].copy(),
    ]
    list_of_dfs_with_fitted_vals = []
    first_stage_model_results = {
        "Names": [
            "Head Age",
            "",
            "Head Age^2",
            "",
            "Head Age^3",
            "",
            "R-Squared",
            "Observations",
            "Households",
        ],
        "Single Males": [],
        "Single Females": [],
        "Married, Male Head": [],
        "Married, Female Head": [],
    }
    for i, data in enumerate(list_of_dfs):
        # Note that including both entity and time effects leads to
        # collinearity.  I think this is because some years at the
        # beginning and end of the sample have just one person.
        # mod = PanelOLS(data.ln_wage_rate,
        #                data[['age', 'age2', 'age3']],
        #                weights=data.fam_smpl_wgt_core,
        #                entity_effects=True, time_effects=True)
        mod = PanelOLS(
            data.ln_wage_rate,
            data[["age", "age2", "age3"]],
            entity_effects=True,
        )
        res = mod.fit(cov_type="clustered", cluster_entity=True)
        # print("Summary for ", list_of_statuses[i])
        # print(res.summary)
        # Save model results to dictionary
        first_stage_model_results[list_of_statuses[i]] = [
            res.params["age"],
            res.std_errors["age"],
            res.params["age2"],
            res.std_errors["age2"],
            res.params["age3"],
            res.std_errors["age3"],
            res.rsquared,
            res.nobs,
            res.entity_info["total"],
        ]
        fit_values = res.predict(fitted=True, effects=True, missing=True)
        fit_values["predictions"] = (
            fit_values["fitted_values"] + fit_values["estimated_effects"]
        )
        list_of_dfs_with_fitted_vals.append(
            data.join(fit_values, how="left", on=["hh_id", "year"])
        )
    df_w_fit = pd.concat(list_of_dfs_with_fitted_vals)
    # list_of_dfs_with_fitted_vals[0].append(
    #     list_of_dfs_with_fitted_vals[1].append(
    #         list_of_dfs_with_fitted_vals[2].append(
    #             list_of_dfs_with_fitted_vals[3]
    #         )
    #     )
    # )
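    # df_w_fit now stacks the four demographic groups.  Its "predictions"
    # column is the fitted value Xb plus the estimated household fixed
    # effect; because predict() was called with missing=True, rows where
    # ln_wage_rate is NaN (the filled-in ages) also receive predictions,
    # which is what imputes wages for years not observed in the sample.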
    df_w_fit.rename(columns={"predictions": "ln_fillin_wage"}, inplace=True)
    # print(
    #     "Description of data coming out of estimation: ",
    #     df_w_fit.describe(),
    # )  # Seems to be the same as going into estimation
    # Compute lifetime income for each filer
    int_rate = 0.04  # assumed interest rate to compute NPV of lifetime income
    # Assumed time endowment, set at 4000 hours.  !!! May want to change
    # this to differ between single and married households !!!
    time_endow = 4000
    df_w_fit["time_wage"] = np.exp(df_w_fit["ln_fillin_wage"]) * time_endow
    df_w_fit["lifetime_inc"] = df_w_fit["time_wage"] * (
        (1 / (1 + int_rate)) ** (df_w_fit["age"] - 20)
    )
    li_df = (df_w_fit[["lifetime_inc"]].groupby(["hh_id"]).sum()).copy()
    # Find each household's percentile in the distribution of lifetime
    # income
    li_df["li_percentile"] = li_df.lifetime_inc.rank(pct=True)
    # Put households into percentile bins
    groups = [0.0, 0.25, 0.5, 0.7, 0.8, 0.9, 0.99, 1.0]
    cats_pct = ["0-25", "26-50", "51-70", "71-80", "81-90", "91-99", "100"]
    li_df = li_df.join(
        pd.get_dummies(
            pd.cut(li_df["li_percentile"], groups, labels=cats_pct)
        )
    ).copy()
    li_df["li_group"] = pd.cut(li_df["li_percentile"], groups)
    deciles = list(np.arange(0.0, 1.1, 0.10))
    cats_10 = ["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10"]
    li_df = li_df.join(
        pd.get_dummies(
            pd.cut(li_df["li_percentile"], deciles, labels=cats_10)
        )
    ).copy()
    li_df["li_decile"] = pd.cut(li_df["li_percentile"], deciles)
    # Merge lifetime income to the panel
    df_w_fit.drop(columns="lifetime_inc", inplace=True)
    df_fit2 = df_w_fit.join(
        li_df, how="left", on=["hh_id"], lsuffix="_x", rsuffix="_y"
    )
    # Drop from the balanced panel those that were not in the original
    # panel
    df_fit2["in_psid"].fillna(False, inplace=True)
    panel_li = (df_fit2[df_fit2["in_psid"]]).copy()
    # Save the regression results
    # pickle.dump(
    #     first_stage_model_results,
    #     open(
    #         os.path.join(
    #             CURDIR, "..", "data", "PSID", "first_stage_reg_results.pkl"
    #         ),
    #         "wb",
    #     ),
    # )
    results_df = pd.DataFrame.from_dict(first_stage_model_results)
    results_df.to_csv(
        os.path.join(
            CURDIR, "..", "data", "PSID", "first_stage_reg_results.csv"
        )
    )
    # Save the DataFrame
    panel_li["li_group"] = panel_li["li_group"].astype("category")
    panel_li["li_decile"] = panel_li["li_decile"].astype("category")
    panel_li.dropna(axis=0, how="all", inplace=True)
    print(panel_li.keys())
    panel_li.to_csv(
        os.path.join(CURDIR, "..", "data", "PSID", "psid_lifetime_income.csv")
    )

    return panel_li
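
# A minimal usage sketch: running this module as a script rebuilds the
# lifetime income panel.  It assumes the default PSID extract,
# psid1968to2015.csv.gz, exists at the path above and that FRED is
# reachable for the CPI series.
if __name__ == "__main__":
    panel_li = prep_data()
    print(panel_li[["age", "li_group", "li_decile"]].head())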