Source code for ogusa.psid_data_setup

import numpy as np
import pandas as pd
import os
import pickle
from pandas_datareader import data as web
import datetime
from linearmodels import PanelOLS
from ogusa.constants import PSID_NOMINAL_VARS, PSID_CONSTANT_VARS


try:
    # This is the case when a separate script is calling this function in
    # this module
    CURDIR = os.path.split(os.path.abspath(__file__))[0]
except NameError:
    # This is the case when a Jupyter notebook is calling this function
    CURDIR = os.getcwd()
output_fldr = "io_files"
output_dir = os.path.join(CURDIR, output_fldr)
if not os.access(output_dir, os.F_OK):
    os.makedirs(output_dir)



[docs]
def prep_data(
    data=os.path.join(CURDIR, "..", "data", "PSID", "psid1968to2015.csv.gz")
):
    """
    This script takes PSID data created from psid_download.R and:
    1) Creates variables at the "tax filing unit" (equal to family
        unit in PSID since there is no info on the filing status chosen).
    2) Selects a sample of observations to work with (e.g., dropping
        very old, very low income, etc.).
    3) Computes a measure of lifetime income and places each household
        into a lifetime income percentile group

    Args:
        data (str): path to RData file with PSID data

    Returns:
        panel_li (Pandas DataFrame): household level data with lifetime
            income groups defined
    """
    # Read data from R into pandas dataframe
    raw_df = pd.read_csv(data, compression="gzip")

    # Create unique identifier for each household
    # note that will define a new household if head or spouse changes
    # keep only current heads
    # before 1983, head is relation.head == 1, 1983+ head is given by
    # relation.head == 10

    # Select just those in the SRC sample, which is representative of the
    # population and so will not require the use of sampling weights
    # SRC sample families have 1968 family interview numbers less than 3000
    raw_df = raw_df[raw_df["ID1968"] < 3000].copy()

    raw_df.loc[
        raw_df.index[(raw_df["year"] < 1983) & (raw_df["relation.head"] == 1)],
        "relation.head",
    ] = 10
    raw_df.loc[
        raw_df.index[(raw_df["year"] < 1983) & (raw_df["relation.head"] == 2)],
        "relation.head",
    ] = 20
    head_df = raw_df.loc[
        raw_df.index[
            (raw_df["relation.head"] == 10) & (raw_df["sequence"] == 1)
        ],
        :,
    ]
    head_df.rename(columns={"pid": "head_id"}, inplace=True)
    # keep legal spouse or long term partners
    spouse_df = raw_df.loc[
        raw_df.index[
            (raw_df["relation.head"] >= 20)
            & (raw_df["relation.head"] <= 22)
            & (raw_df["sequence"] == 2)
        ],
        ["pid", "ID1968", "year", "interview_number"],
    ]
    spouse_df.rename(columns={"pid": "spouse_id"}, inplace=True)
    psid_df = head_df.merge(
        spouse_df, how="left", on=["ID1968", "year", "interview_number"]
    )
    # create unique household id for combination of head and a specific spouse
    psid_df["hh_id"] = (psid_df["head_id"] * 1000000) + psid_df[
        "spouse_id"
    ].fillna(0)

    # clean up files no longer need
    del raw_df, head_df, spouse_df

    # Fix ages to increment by one (or two) between survey waves.  They do
    # not always do this because the survey may be asked as different times
    # of year
    min_age_df = psid_df.groupby("hh_id").agg(["min"])["head_age"]
    min_age_df.rename(columns={"min": "min_age"}, inplace=True)
    min_year_df = psid_df.groupby("hh_id").agg(["min"])["year"]
    min_year_df.rename(columns={"min": "min_year"}, inplace=True)
    psid_df = psid_df.merge(min_age_df, on="hh_id", how="left")
    psid_df = psid_df.merge(min_year_df, on="hh_id", how="left")
    psid_df.sort_values(by=["hh_id", "year"], inplace=True)
    psid_df["age"] = psid_df["year"] - psid_df["min_year"] + psid_df["min_age"]

    # clean up
    del min_age_df, min_year_df

    # Deflate nominal variables
    # because surveys ask about prior year
    psid_df["year_data"] = psid_df["year"] - 1

    # create spouse labor income, since not consistent variable name
    # across time
    psid_df["spouse_labor_inc"] = (
        psid_df["spouse_labor_inc_pre1993"]
        + psid_df["spouse_labor_inc_post1993"]
    )
    psid_df.loc[psid_df["year"] == 1993, "spouse_labor_inc"] = psid_df[
        "spouse_labor_inc_post1993"
    ]

    # set beginning and end dates for data
    start = datetime.datetime(1968, 1, 1)
    end = datetime.datetime.today()
    # pull series of interest using pandas_datareader
    fred_data = web.DataReader(["CPIAUCSL"], "fred", start, end)
    # Make data annual by averaging over months in year
    fred_data = fred_data.resample("YE").mean()
    fred_data["year_data"] = fred_data.index.year
    psid_df2 = psid_df.merge(fred_data, how="left", on="year_data")
    psid_df = psid_df2
    cpi_2010 = fred_data.loc[datetime.datetime(2010, 12, 31), "CPIAUCSL"]

    for item in PSID_NOMINAL_VARS:
        psid_df[item] = (psid_df[item] * cpi_2010) / psid_df["CPIAUCSL"]

    # clean up
    del fred_data, psid_df2

    # Fill in  missing values with zeros
    psid_df[PSID_NOMINAL_VARS] = psid_df[PSID_NOMINAL_VARS].fillna(0)
    psid_df[["head_annual_hours", "spouse_annual_hours"]] = psid_df[
        ["head_annual_hours", "spouse_annual_hours"]
    ].fillna(0)

    # Construct family ("filing unit") level variables
    psid_df["incwage_hh"] = (
        psid_df["head_labor_inc"] + psid_df["spouse_labor_inc"]
    )
    psid_df["earninc_hh"] = (
        psid_df["incwage_hh"]
        + psid_df["head_noncorp_bus_labor_income"]
        + psid_df["spouse_noncorp_bus_labor_income"]
    )
    psid_df["businc_hh"] = (
        psid_df["head_noncorp_bus_labor_income"]
        + psid_df["spouse_noncorp_bus_labor_income"]
    )
    # note that PSID doesn't separate hours towards employed and business
    # work
    psid_df["earnhours_hh"] = (
        psid_df["head_annual_hours"] + psid_df["spouse_annual_hours"]
    )
    psid_df["wage_rate"] = psid_df["incwage_hh"] / psid_df["earnhours_hh"]
    psid_df["earn_rate"] = psid_df["earninc_hh"] / psid_df["earnhours_hh"]
    with np.errstate(divide="ignore"):
        psid_df["ln_wage_rate"] = np.log(psid_df["wage_rate"])
        psid_df["ln_earn_rate"] = np.log(psid_df["earn_rate"])
    psid_df["singlemale"] = (psid_df["head_gender"] == 1) & (
        psid_df["marital_status"] != 1
    )
    psid_df["singlefemale"] = (psid_df["head_gender"] == 2) & (
        psid_df["marital_status"] != 1
    )
    psid_df["marriedmalehead"] = (psid_df["head_gender"] == 1) & (
        psid_df["marital_status"] == 1
    )
    psid_df["marriedfemalehead"] = (psid_df["head_gender"] == 2) & (
        psid_df["marital_status"] == 1
    )
    psid_df["married"] = (psid_df["marital_status"] == 1).astype(int)

    # sample selection
    # drop very young, very old, those with very low earnings, and any
    # outliers with very high earnings, those working at least 200 hrs
    # should check to see if we want to drop any particular years... (e.g.,
    # I think some data is missing before 1970)
    psid_df.query(
        "age >= 20 & age <= 80 & incwage_hh >= 5"
        + " & wage_rate >= 5 & wage_rate <= 25000"
        + " & earnhours_hh > 200",
        inplace=True,
    )
    # Indicator for obs being from PSID not interpolated value
    # used to make drops later
    psid_df.sort_values(by=["hh_id", "year"], inplace=True)
    # psid_df[
    #     [
    #         "head_id",
    #         "spouse_id",
    #         "hh_id",
    #         "head_age",
    #         "age",
    #         "spouse_age",
    #         "ID1968",
    #         "year",
    #         "interview_number",
    #         "head_marital_status",
    #         "marital_status",
    #     ]
    # ].to_csv("psid_to_check.csv")
    # The next several lines try to identify and then drop from the sample
    # hh_ids that report more than one type of marital status
    # there are 179 of these, 26 are men who report being married and not at
    # different times, even when a spouse id is not present
    marriedmale_df = psid_df.groupby("hh_id").agg(["max"])["marriedmalehead"]
    singlemale_df = psid_df.groupby("hh_id").agg(["max"])["singlemale"]
    marriedfemale_df = psid_df.groupby("hh_id").agg(["max"])[
        "marriedfemalehead"
    ]
    singlefemale_df = psid_df.groupby("hh_id").agg(["max"])["singlefemale"]
    marriedmale_df.rename(columns={"max": "m_marriedmalehead"}, inplace=True)
    singlemale_df.rename(columns={"max": "m_singlemale"}, inplace=True)
    marriedfemale_df.rename(
        columns={"max": "m_marriedfemalehead"}, inplace=True
    )
    singlefemale_df.rename(columns={"max": "m_singlefemale"}, inplace=True)
    merged_df = marriedmale_df.join(
        [singlemale_df, marriedfemale_df, singlefemale_df],
        how="outer",
        sort=True,
    )
    merged_df["sum_status"] = (
        merged_df["m_singlemale"].astype(int)
        + merged_df["m_singlefemale"].astype(int)
        + merged_df["m_marriedfemalehead"].astype(int)
        + merged_df["m_marriedmalehead"].astype(int)
    )
    merged_df_to_list = merged_df[merged_df["sum_status"] > 1]
    merged_df_to_list.to_csv("hh_id_two_statuses.csv")
    hhid_to_drop = merged_df_to_list.copy()
    hhid_to_drop["keep"] = False
    psid_df = psid_df.merge(hhid_to_drop, on="hh_id", how="left")
    psid_df["keep"].fillna(True, inplace=True)
    psid_df = psid_df[psid_df["keep"]].copy()
    psid_df["in_psid"] = True
    # print number of obs by year
    print(
        "Number of obs by year = ",
        psid_df["hh_id"].groupby([psid_df.year]).agg("count"),
    )
    num_obs_psid = psid_df.shape[0]
    psid_df.sort_values(by=["hh_id", "year"], inplace=True)

    # clean up
    del (
        merged_df_to_list,
        hhid_to_drop,
        marriedmale_df,
        marriedfemale_df,
        singlemale_df,
        singlefemale_df,
    )

    # "fill in" observations - so have observation for each household
    # from age 20-80
    # note that do this before running regression, but that's ok since
    # wages missing here so these obs don't affect regression
    uid = psid_df["hh_id"].unique()
    all_ages = list(range(20, 81))  # for list of ages 20 to 80
    ids_full = np.array([[x] * len(all_ages) for x in list(uid)]).flatten()
    ages = all_ages * len(uid)
    balanced_panel = pd.DataFrame({"hh_id": ids_full, "age": ages})
    rebalanced_data = balanced_panel.merge(
        psid_df, how="left", on=["hh_id", "age"]
    )
    # Backfill and then forward fill variables that are constant over time
    # within hhid
    for item in PSID_CONSTANT_VARS:
        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].bfill()
        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].ffill()

    ### NOTE: we seem to get some cases where the marital status is not constant
    # despite trying to set up the identification of a household such that it
    # has to be.  Why this is happening needs to be checked.

    # Fill in year by doing a cumulative counter within each hh_id and then
    # using the difference between age and this counter to infer what the
    # year should be'
    rebalanced_data.sort_values(["hh_id", "age"], inplace=True)
    rebalanced_data["counter"] = rebalanced_data.groupby("hh_id").cumcount()
    rebalanced_data["diff"] = (
        rebalanced_data["year"] - rebalanced_data["counter"]
    )
    rebalanced_data["diff"].fillna(
        0, inplace=True
    )  # because NaNs if year missing
    max_df = rebalanced_data.groupby("hh_id").agg(["max"])["diff"]
    rebalanced_data = rebalanced_data.join(max_df, how="left", on=["hh_id"])
    rebalanced_data["year"] = (
        rebalanced_data["max"] + rebalanced_data["counter"]
    )

    # clean up
    del max_df, balanced_panel

    ### Check that there are 61 obs for each hh_id

    # create additional variables for first stage regressions
    df = rebalanced_data.reset_index()
    df["age2"] = df["age"] ** 2
    df["age3"] = df["age"] ** 3
    df["age_smale"] = df["age"] * df["singlemale"]
    df["age_sfemale"] = df["age"] * df["singlefemale"]
    df["age_mmale"] = df["age"] * df["marriedmalehead"]
    df["age_mfemale"] = df["age"] * df["marriedfemalehead"]
    df["age_smale2"] = df["age2"] * df["singlemale"]
    df["age_sfemale2"] = df["age2"] * df["singlefemale"]
    df["age_mmale2"] = df["age2"] * df["marriedmalehead"]
    df["age_mfemale2"] = df["age2"] * df["marriedfemalehead"]
    df["age_smale3"] = df["age3"] * df["singlemale"]
    df["age_sfemale3"] = df["age3"] * df["singlefemale"]
    df["age_mmale3"] = df["age3"] * df["marriedmalehead"]
    df["age_mfemale3"] = df["age3"] * df["marriedfemalehead"]

    # clean up
    del rebalanced_data

    # run regressions to impute wages for years not observed in sample
    df.set_index(["hh_id", "year"], inplace=True)
    list_of_statuses = [
        "Single Males",
        "Single Females",
        "Married, Male Head",
        "Married, Female Head",
    ]
    list_of_dfs = [
        df[df["singlemale"]].copy(),
        df[df["singlefemale"]].copy(),
        df[df["marriedmalehead"]].copy(),
        df[df["marriedfemalehead"]].copy(),
    ]
    list_of_dfs_with_fitted_vals = []
    first_stage_model_results = {
        "Names": [
            "Head Age",
            "",
            "Head Age^2",
            "",
            "Head Age^3",
            "",
            "R-Squared",
            "Observations",
            "Households",
        ],
        "Single Males": [],
        "Single Females": [],
        "Married, Male Head": [],
        "Married, Female Head": [],
    }
    for i, data in enumerate(list_of_dfs):
        # Note that including entity and time effects leads to a collinearity
        # I think this is because there are some years at begin and end of
        # sample with just one person
        # mod = PanelOLS(data.ln_wage_rate,
        #                data[['age', 'age2', 'age3']],
        #                weights=data.fam_smpl_wgt_core,
        #                entity_effects=True, time_effects=True)
        mod = PanelOLS(
            data.ln_wage_rate,
            data[["age", "age2", "age3"]],
            entity_effects=True,
        )
        res = mod.fit(cov_type="clustered", cluster_entity=True)
        # print("Summary for ", list_of_statuses[i])
        # print(res.summary)
        # Save model results to dictionary
        first_stage_model_results[list_of_statuses[i]] = [
            res.params["age"],
            res.std_errors["age"],
            res.params["age2"],
            res.std_errors["age2"],
            res.params["age3"],
            res.std_errors["age3"],
            res.rsquared,
            res.nobs,
            res.entity_info["total"],
        ]
        fit_values = res.predict(fitted=True, effects=True, missing=True)
        fit_values["predictions"] = (
            fit_values["fitted_values"] + fit_values["estimated_effects"]
        )
        list_of_dfs_with_fitted_vals.append(
            data.join(fit_values, how="left", on=["hh_id", "year"])
        )

    df_w_fit = pd.concat(list_of_dfs_with_fitted_vals)
    # list_of_dfs_with_fitted_vals[0].append(
    #     list_of_dfs_with_fitted_vals[1].append(
    #         list_of_dfs_with_fitted_vals[2].append(
    #             list_of_dfs_with_fitted_vals[3]
    #         )
    #     )
    # )
    df_w_fit.rename(columns={"predictions": "ln_fillin_wage"}, inplace=True)
    # print(
    #     "Descritpion of data coming out of estimation: ", df_w_fit.describe()
    # )
    # Seems to be the same as going into estimation

    # Compute lifetime income for each filer
    int_rate = 0.04  # assumed interest rate to compute NPV of lifetime income
    time_endow = 4000
    # assumed time endowment - set at 4000 hours !!! May want
    # to change this to be different for single households than married !!!
    df_w_fit["time_wage"] = np.exp(df_w_fit["ln_fillin_wage"]) * time_endow
    df_w_fit["lifetime_inc"] = df_w_fit["time_wage"] * (
        (1 / (1 + int_rate)) ** (df_w_fit["age"] - 20)
    )
    li_df = (df_w_fit[["lifetime_inc"]].groupby(["hh_id"]).sum()).copy()
    # find percentile in distrubtion of lifetime income
    li_df["li_percentile"] = li_df.lifetime_inc.rank(pct=True)
    # Put in bins
    groups = [0.0, 0.25, 0.5, 0.7, 0.8, 0.9, 0.99, 1.0]
    cats_pct = ["0-25", "26-50", "51-70", "71-80", "81-90", "91-99", "100"]
    dummies = pd.get_dummies(
        pd.cut(li_df["li_percentile"], groups, labels=cats_pct)
    )
    li_df = pd.concat([li_df, dummies], axis=1).copy()
    li_df["li_group"] = pd.cut(li_df["li_percentile"], groups)
    deciles = list(np.arange(0.0, 1.1, 0.10))
    cats_10 = ["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10"]
    dummies = pd.get_dummies(
        pd.cut(li_df["li_percentile"], deciles, labels=cats_10)
    )
    li_df = pd.concat([li_df, dummies], axis=1).copy()
    li_df["li_decile"] = pd.cut(li_df["li_percentile"], deciles)

    # Merge lifetime income to panel
    df_w_fit.drop(columns="lifetime_inc", inplace=True)
    df_fit2 = df_w_fit.join(
        li_df, how="left", on=["hh_id"], lsuffix="_x", rsuffix="_y"
    )
    # Drop from balanced panel those that were not in original panel
    df_fit2["in_psid"].fillna(False, inplace=True)
    panel_li = (df_fit2[df_fit2["in_psid"]]).copy()

    # Save dictionary of regression results
    # pickle.dump(
    #     first_stage_model_results,
    #     open(
    #         os.path.join(
    #             CURDIR, "..", "data", "PSID", "first_stage_reg_results.pkl"
    #         ),
    #         "wb",
    #     ),
    # )
    results_df = pd.DataFrame.from_dict(first_stage_model_results)
    results_df.to_csv(
        os.path.join(
            CURDIR, "..", "data", "PSID", "first_stage_reg_results.pkl"
        )
    )

    # Save dataframe
    panel_li.loc["li_group"] = panel_li["li_group"].astype("category")
    panel_li.loc["li_decile"] = panel_li["li_decile"].astype("category")
    panel_li.dropna(axis=0, how="all", inplace=True)
    print(panel_li.keys())
    panel_li.to_csv(
        os.path.join(CURDIR, "..", "data", "PSID", "psid_lifetime_income.csv")
    )

    return panel_li