Source code for ogcore.demographics

"""
-------------------------------------------------------------------------------
Functions for generating demographic objects necessary for the OG-USA model. A
list of UN official 3-digit country codes and corresponding 3-character country
abbreviations is available at https://unstats.un.org/unsd/methodology/m49/
-------------------------------------------------------------------------------
"""

# Import packages
import os
import numpy as np
from io import StringIO
import scipy.optimize as opt
import pandas as pd
from ogcore.utils import get_legacy_session
from ogcore import parameter_plots as pp

START_YEAR = 2024
END_YEAR = 2024
UN_COUNTRY_CODE = "840"  # UN code for USA
# create output director for figures
CUR_PATH = os.path.split(os.path.abspath(__file__))[0]
OUTPUT_DIR = os.path.join(CUR_PATH, "..", "data", "OUTPUT", "Demographics")
if os.access(OUTPUT_DIR, os.F_OK) is False:
    os.makedirs(OUTPUT_DIR)


"""
------------------------------------------------------------------------
Define functions
------------------------------------------------------------------------
"""



[docs]
def get_un_data(
    variable_code,
    country_id=UN_COUNTRY_CODE,
    start_year=START_YEAR,
    end_year=END_YEAR,
):
    """
    This function retrieves data from the United Nations Data Portal API
    for UN population data (see
    https://population.un.org/dataportal/about/dataapi)

    Args:
        variable_code (str): variable code for UN data
        country_id (str): country id for UN data
        start_year (int): start year for UN data
        end_year (int): end year for UN data

    Returns:
        df (Pandas DataFrame): DataFrame of UN data
    """
    target = (
        "https://population.un.org/dataportalapi/api/v1/data/indicators/"
        + variable_code
        + "/locations/"
        + country_id
        + "/start/"
        + str(start_year)
        + "/end/"
        + str(end_year)
        + "?format=csv"
    )

    # Check for a file named "un_api_token.txt" in the current directory
    if os.path.exists(os.path.join("un_api_token.txt")):
        with open(os.path.join("un_api_token.txt"), "r") as file:
            UN_TOKEN = file.read().strip()
    else:  # if file not exist, prompt user for token
        try:
            UN_TOKEN = input(
                "Please enter your UN API token (press return if you do not have one): "
            )
            # write the UN_TOKEN to a file to find in the future
            with open(os.path.join("un_api_token.txt"), "w") as file:
                file.write(UN_TOKEN)
        except EOFError:
            UN_TOKEN = ""

    # get data from url
    payload = {}
    headers = {"Authorization": "Bearer " + UN_TOKEN}
    response = get_legacy_session().get(target, headers=headers, data=payload)
    # Check if the request was successful before processing
    if response.status_code == 200:
        csvStringIO = StringIO(response.text)
        df = pd.read_csv(csvStringIO, sep="|", header=1)

        # keep just what is needed from data
        df = df[df.Variant == "Median"]
        df = df[df.Sex == "Both sexes"][["TimeLabel", "AgeLabel", "Value"]]
        df.rename(
            {"TimeLabel": "year", "AgeLabel": "age", "Value": "value"},
            axis=1,
            inplace=True,
        )
        df.loc[df.age == "100+", "age"] = 100
        df.age = df.age.astype(int)
        df.year = df.year.astype(int)
        df = df[df.age < 100]  # need to drop 100+ age category
    else:
        # Read from UN GH Repo:
        print(
            f"Failed to retrieve population data from UN. Reading "
            + " from https://github.com/EAPD-DRB/Population-Data "
            + "instead of UN WPP API"
        )
        country_dict = {
            "840": "USA",
            "710": "ZAF",
            "458": "MYS",
            "356": "IND",
            "826": "UK",
            "360": "IDN",
            "608": "PHL",
            "764": "THA",
            "076": "BRA",
            "410": "KOR",
        }
        un_variable_dict = {
            "68": "fertility_rates",
            "80": "mortality_rates",
            "47": "population",
        }
        country = country_dict[country_id]
        variable = un_variable_dict[variable_code]
        url = (
            "https://raw.githubusercontent.com/EAPD-DRB/"
            + "Population-Data/main/"
            + "Data/{c}/UN_{v}_data.csv".format(c=country, v=variable)
        )
        df = pd.read_csv(url)
        # keep just the years requested
        df = df[(df.year >= start_year) & (df.year <= end_year)]

        # Do we still want to keep the status code for failures?
        # print(
        #     f"Failed to retrieve population data. HTTP status code: {response.status_code}"
        # )
        # assert False

    return df




[docs]
def get_fert(
    totpers=100,
    min_age=0,
    max_age=99,
    country_id=UN_COUNTRY_CODE,
    start_year=START_YEAR,
    end_year=END_YEAR,
    graph=False,
    plot_path=None,
    download_path=None,
):
    """
    This function generates a vector of fertility rates by model period
    age that corresponds to the fertility rate data by age in years.

    Args:
        totpers (int): total number of agent life periods (E+S), >= 3
        min_age (int): age in years at which agents are born, >= 0
        max_age (int): age in years at which agents die with certainty,
            >= 4, < 100 (max age in UN data is 99, 100+ i same group)
        country_id (str): country id for UN data
        start_year (int): start year for UN data
        end_year (int): end year for UN data
        graph (bool): =True if want graphical output
        plot_path (str): path to save fertility rate plot
        download_path (str): path to save fertility rate data

    Returns:
        fert_rates (Numpy array): fertility rates for each year of data
            and model age
        fig (Matplotlib Figure): figure object if graph=True and plot_path=None

    """
    # initialize fert rates array
    fert_rates_2D = np.zeros((end_year + 1 - start_year, totpers))
    # Read UN data
    df = get_un_data(
        "68", country_id=country_id, start_year=start_year, end_year=end_year
    )
    # CLean and rebin data
    for y in range(start_year, end_year + 1):
        df_y = df[(df.age >= min_age) & (df.age <= max_age) & (df.year == y)]
        # put in vector
        fert_rates = df_y.value.values
        # fill in with zeros for ages  < 15 and > 49
        # NOTE: this assumes min_year < 15 and max_age > 49
        fert_rates = np.append(fert_rates, np.zeros(max_age - 49))
        fert_rates = np.append(np.zeros(15 - min_age), fert_rates)
        # divide by 1000 because fertility rates are number of births per
        # 1000 woman and we want births per person (might update to account
        # from fraction men more correctly - below assumes 50/50 men and women)
        fert_rates = fert_rates / 2000
        # Rebin data in the case that model period not equal to one calendar
        # year
        fert_rates = pop_rebin(fert_rates, totpers)
        fert_rates_2D[y - start_year, :] = fert_rates

    if download_path:
        np.savetxt(
            os.path.join(download_path, "fert_rates.csv"),
            fert_rates_2D,
            delimiter=",",
        )

    # Create plots if needed
    if graph:
        if start_year == end_year:
            years_to_plot = [start_year]
        else:
            years_to_plot = [start_year, end_year]
        if plot_path is not None:
            pp.plot_fert_rates(
                [fert_rates_2D],
                start_year=start_year,
                years_to_plot=years_to_plot,
                path=plot_path,
            )
            return fert_rates_2D
        else:
            fig = pp.plot_fert_rates(
                [fert_rates_2D],
                start_year=start_year,
                years_to_plot=years_to_plot,
            )
            return fert_rates_2D, fig
    else:
        return fert_rates_2D




[docs]
def get_mort(
    totpers=100,
    min_age=0,
    max_age=99,
    country_id=UN_COUNTRY_CODE,
    start_year=START_YEAR,
    end_year=END_YEAR,
    graph=False,
    plot_path=None,
    download_path=None,
):
    """
    This function generates a vector of mortality rates by model period
    age.

    Args:
        totpers (int): total number of agent life periods (E+S), >= 3
        min_age (int): age in years at which agents are born, >= 0
        max_age (int): age in years at which agents die with certainty,
            >= 4, < 100 (max age in UN data is 99, 100+ i same group)
        country_id (str): country id for UN data
        start_year (int): start year for UN data
        end_year (int): end year for UN data
        graph (bool): =True if want graphical output
        plot_path (str): path to save mortality rate plot
        download_path (str): path to save mortality rate data

    Returns:
        mort_rates (Numpy array) mortality rates for each year of data
            and model age
        infmort_rate_vec (Numpy array): infant mortality rates for each
        fig (Matplotlib Figure): figure object if graph=True and plot_path=None

    """
    mort_rates_2D = np.zeros((end_year + 1 - start_year, totpers))
    infmort_rate_vec = np.zeros(end_year + 1 - start_year)
    # Read UN data
    df = get_un_data(
        "80", country_id=country_id, start_year=start_year, end_year=end_year
    )
    # CLean and rebin data
    for y in range(start_year, end_year + 1):
        df_y = df[(df.age >= min_age) & (df.age <= max_age) & (df.year == y)]
        # put in vector
        mort_rates_data = df_y.value.values
        # In UN data, mortality rates for 0 year olds are the infant
        # mortality rates
        infmort_rate = mort_rates_data[0]
        # Rebin data in the case that model period not equal to one calendar
        # year
        # make mort rates those from age 1-100 and set to 1 for age 100
        mort_rates_data = np.append(mort_rates_data[1:], 1.0)
        mort_rates = pop_rebin(mort_rates_data, totpers)
        # put in 2D array
        mort_rates_2D[y - start_year, :] = mort_rates
        infmort_rate_vec[y - start_year] = infmort_rate

    if download_path:
        np.savetxt(
            os.path.join(download_path, "mort_rates.csv"),
            mort_rates_2D,
            delimiter=",",
        )
        np.savetxt(
            os.path.join(download_path, "infmort_rates.csv"),
            infmort_rate_vec,
            delimiter=",",
        )

    # Create plots if needed
    if graph:
        if start_year == end_year:
            years_to_plot = [start_year]
        else:
            years_to_plot = [start_year, end_year]
        if plot_path is not None:
            pp.plot_mort_rates_data(
                mort_rates_2D,
                start_year,
                years_to_plot,
                path=plot_path,
            )
            return mort_rates_2D, infmort_rate_vec
        else:
            fig = pp.plot_mort_rates_data(
                mort_rates_2D,
                start_year,
                years_to_plot,
            )
            return mort_rates_2D, infmort_rate_vec, fig
    else:
        return mort_rates_2D, infmort_rate_vec




[docs]
def get_pop(
    E=20,
    S=80,
    min_age=0,
    max_age=99,
    infer_pop=False,
    fert_rates=None,
    mort_rates=None,
    infmort_rates=None,
    imm_rates=None,
    initial_pop=None,
    pre_pop_dist=None,
    country_id=UN_COUNTRY_CODE,
    start_year=START_YEAR,
    end_year=END_YEAR,
    download_path=None,
):
    """
    Retrieves the population distribution data from the UN data API

    Args:
        E (int): number of model periods in which agent is not
            economically active, >= 1
        S (int): number of model periods in which agent is economically
            active, >= 3
        min_age (int): age in years at which agents are born, >= 0
        max_age (int): age in years at which agents die with certainty,
            >= 4, < 100 (max age in UN data is 99, 100+ i same group)
        infer_pop (bool): =True if want to infer the population
            from the given fertility, mortality, and immigration rates
        fert_rates (Numpy array): fertility rates for each year of data
            and model age
        mort_rates (Numpy array): mortality rates for each year of data
            and model age
        infmort_rates (Numpy array): infant mortality rates for each year
            of data
        imm_rates (Numpy array): immigration rates for reach year of data
            and model age
        initial_pop_data (Pandas DataFrame): initial population data
            for the first year of model calibration (start_year)
        pre_pop_dist (Numpy array): population distribution for the year
            before the initial year for calibration
        country_id (str): country id for UN data
        start_year (int): start year data
        end_year (int): end year for data
        download_path (str): path to save population distribution data

    Returns:
        pop_2D (Numpy array): population distribution over T0 periods
        pre_pop (Numpy array): population distribution one year before
            initial year for calibration of omega_S_preTP
    """
    # Generate time path of the nonstationary population distribution
    # Get path up to end of data year
    pop_2D = np.zeros((end_year + 2 - start_year, E + S))
    if infer_pop:
        if pre_pop_dist is None:
            pre_pop_data = get_un_data(
                "47",
                country_id=country_id,
                start_year=start_year - 1,
                end_year=start_year - 1,
            )
            if download_path:
                pre_pop_data.to_csv(
                    os.path.join(download_path, "raw_pre_pop_data_UN.csv"),
                    index=False,
                )
            pre_pop_sample = pre_pop_data[
                (pre_pop_data["age"] >= min_age)
                & (pre_pop_data["age"] <= max_age)
            ]
            pre_pop = pre_pop_sample.value.values
            pre_pop_dist = pop_rebin(pre_pop, E + S)
        else:
            pre_pop = pre_pop_dist
        if initial_pop is None:
            initial_pop_data = get_un_data(
                "47",
                country_id=country_id,
                start_year=start_year,
                end_year=start_year,
            )
            initial_pop_sample = initial_pop_data[
                (initial_pop_data["age"] >= min_age)
                & (initial_pop_data["age"] <= max_age)
            ]
            initial_pop = initial_pop_sample.value.values
            initial_pop = pop_rebin(initial_pop, E + S)
        # Check that have all necessary inputs to infer the population
        # distribution
        assert not [
            x
            for x in (fert_rates, mort_rates, infmort_rates, imm_rates)
            if x is None
        ]
        len_pop_dist = end_year + 1 - start_year
        pop_2D = np.zeros((len_pop_dist, E + S))
        # set initial population distribution in the counterfactual to
        # the first year of the user provided distribution
        pop_2D[0, :] = initial_pop
        for t in range(1, len_pop_dist):
            # find newborns next period
            newborns = np.dot(fert_rates[t - 1, :], pop_2D[t - 1, :])
            pop_2D[t, 0] = (1 - infmort_rates[t - 1]) * newborns + imm_rates[
                t - 1, 0
            ] * pop_2D[t - 1, 0]
            pop_2D[t, 1:] = (
                pop_2D[t - 1, :-1] * (1 - mort_rates[t - 1, :-1])
                + pop_2D[t - 1, 1:] * imm_rates[t - 1, 1:]
            )
    else:
        # Read UN data
        pop_data = get_un_data(
            "47",
            country_id=country_id,
            start_year=start_year,
            end_year=end_year
            + 2,  # note go to + 2 because needed to infer immigration for end_year
        )
        # CLean and rebin data
        for y in range(start_year, end_year + 2):
            pop_data_sample = pop_data[
                (pop_data["age"] >= min_age)
                & (pop_data["age"] <= max_age)
                & (pop_data["year"] == y)
            ]
            pop = pop_data_sample.value.values
            # Generate the current population distribution given that E+S might
            # be less than max_age-min_age+1
            # age_per_EpS = np.arange(1, E + S + 1)
            pop_EpS = pop_rebin(pop, E + S)
            pop_2D[y - start_year, :] = pop_EpS

        # get population distribution one year before initial year for
        # calibration of omega_S_preTP
        pre_pop_data = get_un_data(
            "47",
            country_id=country_id,
            start_year=start_year - 1,
            end_year=start_year - 1,
        )
        pre_pop_sample = pre_pop_data[
            (pre_pop_data["age"] >= min_age) & (pre_pop_data["age"] <= max_age)
        ]
        pre_pop = pre_pop_sample.value.values

    if download_path:
        np.savetxt(
            os.path.join(download_path, "population_distribution.csv"),
            pop_2D,
            delimiter=",",
        )
        np.savetxt(
            os.path.join(
                download_path, "pre_period_population_distribution.csv"
            ),
            pre_pop,
            delimiter=",",
        )

    return pop_2D, pre_pop




[docs]
def pop_rebin(curr_pop_dist, totpers_new):
    """
    For cases in which totpers (E+S) is less than the number of periods
    in the population distribution data, this function calculates a new
    population distribution vector with totpers (E+S) elements.

    Args:
        curr_pop_dist (Numpy array): population distribution over N
            periods
        totpers_new (int): number of periods to which we are
            transforming the population distribution, >= 3

    Returns:
        curr_pop_new (Numpy array): new population distribution over
            totpers (E+S) periods that approximates curr_pop_dist

    """
    # Number of periods in original data
    assert totpers_new >= 3

    # Number of periods in original data
    totpers_orig = len(curr_pop_dist)
    if int(totpers_new) == totpers_orig:
        curr_pop_new = curr_pop_dist
    elif int(totpers_new) < totpers_orig:
        num_sub_bins = float(10000)
        curr_pop_sub = np.repeat(
            np.float64(curr_pop_dist) / num_sub_bins, num_sub_bins
        )
        len_subbins = (np.float64(totpers_orig * num_sub_bins)) / totpers_new
        curr_pop_new = np.zeros(totpers_new, dtype=np.float64)
        end_sub_bin = 0
        for i in range(totpers_new):
            beg_sub_bin = int(end_sub_bin)
            end_sub_bin = int(np.rint((i + 1) * len_subbins))
            curr_pop_new[i] = curr_pop_sub[beg_sub_bin:end_sub_bin].sum()
        # Return curr_pop_new to single precision float (float32)
        # datatype
        curr_pop_new = np.float32(curr_pop_new)

    return curr_pop_new




[docs]
def get_imm_rates(
    totpers=100,
    min_age=0,
    max_age=99,
    fert_rates=None,
    mort_rates=None,
    infmort_rates=None,
    pop_dist=None,
    country_id=UN_COUNTRY_CODE,
    start_year=START_YEAR,
    end_year=END_YEAR,
    graph=False,
    plot_path=None,
    download_path=None,
):
    """
    Calculate immigration rates by age as a residual given population
    levels in different periods, then output average calculated
    immigration rate. We have to replace the first mortality rate in
    this function in order to adjust the first implied immigration rate

    Args:
        totpers (int): total number of agent life periods (E+S), >= 3
        min_age (int): age in years at which agents are born, >= 0
        max_age (int): age in years at which agents die with certainty,
            >= 4
        fert_rates (Numpy array): fertility rates for each year of data
            and model age
        mort_rates (Numpy array): mortality rates for each year of data
            and model age
        infmort_rates (Numpy array): infant mortality rates for each year
            of data
        pop_dist (Numpy array): population distribution over T0+1 periods
        country_id (str): country id for UN data
        start_year (int): start year for UN data
        end_year (int): end year for UN data
        graph (bool): =True if want graphical output
        plot_path (str): path to save figure to
        download_path (str): path to save immigration rate data

    Returns:
        imm_rates_2D (Numpy array):immigration rates that correspond to
            each year of data and period of life, length E+S

    """
    imm_rates_2D = np.zeros((end_year + 1 - start_year, totpers))
    if fert_rates is None:
        # get fert rates from UN data from initial year to data year
        fert_rates = get_fert(
            totpers, min_age, max_age, country_id, start_year, end_year
        )
    else:
        # ensure that user provided fert_rates and mort rates of same size
        assert fert_rates.shape == mort_rates.shape
    if mort_rates is None:
        # get mort rates from UN data from initial year to data year
        mort_rates, infmort_rates = get_mort(
            totpers, min_age, max_age, country_id, start_year, end_year
        )
    else:
        # ensure that user provided fert_rates and mort rates of same size
        assert fert_rates.shape == mort_rates.shape
        assert infmort_rates is not None
        assert infmort_rates.shape[0] == mort_rates.shape[0]
    if pop_dist is None:
        # need to read UN population data
        df = get_un_data(
            "47",
            country_id=country_id,
            start_year=start_year,
            end_year=end_year + 2,
        )
        pop_dist = np.zeros((end_year + 2 - start_year, totpers))
        for y in range(start_year, end_year + 1):
            pop_t = df[
                (df.age < 100) & (df.age >= 0) & (df.year == y)
            ].value.values
            pop_t = pop_rebin(pop_t, totpers)
            pop_dist[y - start_year, :] = pop_t
    # Make sure shape conforms
    assert pop_dist.shape[1] == mort_rates.shape[1]
    assert pop_dist.shape[0] == end_year - start_year + 2
    for y in range(start_year, end_year + 1):
        pop_t = pop_dist[y - start_year, :]
        pop_tp1 = pop_dist[y + 1 - start_year, :]
        # initialize imm_rate vector
        imm_rates = np.zeros(totpers)
        # back out imm rates by age for each year
        newborns = np.dot(fert_rates[y - start_year, :], pop_t)
        # new born imm_rate
        imm_rates[0] = (
            pop_tp1[0] - (1 - infmort_rates[y - start_year]) * newborns
        ) / pop_t[0]
        # all other age imm_rates
        imm_rates[1:] = (
            pop_tp1[1:] - (1 - mort_rates[y - start_year, :-1]) * pop_t[:-1]
        ) / pop_t[1:]

        imm_rates_2D[y - start_year, :] = imm_rates

    if download_path:
        np.savetxt(
            os.path.join(download_path, "immigration_rates.csv"),
            imm_rates_2D,
            delimiter=",",
        )

    # Create plots if needed
    if graph:
        if start_year == end_year:
            years_to_plot = [start_year]
        else:
            years_to_plot = [start_year, end_year]
        if plot_path is not None:
            pp.plot_imm_rates(
                imm_rates_2D,
                start_year,
                years_to_plot,
                path=plot_path,
            )
            return imm_rates_2D
        else:
            fig = pp.plot_imm_rates(
                imm_rates_2D,
                start_year,
                years_to_plot,
            )
            return imm_rates_2D, fig
    else:
        return imm_rates_2D




[docs]
def immsolve(imm_rates, *args):
    """
    This function generates a vector of errors representing the
    difference in two consecutive periods stationary population
    distributions. This vector of differences is the zero-function
    objective used to solve for the immigration rates vector, similar to
    the original immigration rates vector from get_imm_rates(), that
    sets the steady-state population distribution by age equal to the
    population distribution in period int(1.5*S)

    Args:
        imm_rates (Numpy array):immigration rates that correspond to
            each period of life, length E+S
        args (tuple): (fert_rates, mort_rates, infmort_rates, omega_cur,
            g_n_SS)

    Returns:
        omega_errs (Numpy array): difference between omega_new and
            omega_cur_pct, length E+S

    """
    fert_rates, mort_rates, infmort_rates, omega_cur_lev, g_n_SS = args
    omega_cur_pct = omega_cur_lev / omega_cur_lev.sum()
    totpers = len(fert_rates)
    OMEGA = np.zeros((totpers, totpers))
    OMEGA[0, :] = (1 - infmort_rates) * fert_rates + np.hstack(
        (imm_rates[0], np.zeros(totpers - 1))
    )
    OMEGA[1:, :-1] += np.diag(1 - mort_rates[:-1])
    OMEGA[1:, 1:] += np.diag(imm_rates[1:])
    omega_new = np.dot(OMEGA, omega_cur_pct) / (1 + g_n_SS)
    omega_errs = omega_new - omega_cur_pct

    return omega_errs




[docs]
def get_pop_objs(
    E=20,
    S=80,
    T=320,
    min_age=0,
    max_age=99,
    fert_rates=None,
    mort_rates=None,
    infmort_rates=None,
    imm_rates=None,
    infer_pop=False,
    pop_dist=None,
    pre_pop_dist=None,
    country_id=UN_COUNTRY_CODE,
    initial_data_year=START_YEAR - 1,
    final_data_year=START_YEAR + 2,
    GraphDiag=True,
    download_path=None,
):
    """
    This function produces the demographics objects to be used in the
    OG-USA model package.

    Args:
        E (int): number of model periods in which agent is not
            economically active, >= 1
        S (int): number of model periods in which agent is economically
            active, >= 3
        T (int): number of periods to be simulated in TPI, > 2*S
        min_age (int): age in years at which agents are born, >= 0
        max_age (int): age in years at which agents die with certainty,
            >= 4, < 100 (max age in UN data is 99, 100+ i same group)
        fert_rates (array_like): user provided fertility rates, dimensions
            are T0 x E+S
        mort_rates (array_like): user provided mortality rates, dimensions
            are T0 x E+S
        infmort_rates (array_like): user provided infant mortality rates,
            length T0
        imm_rates (array_like): user provided immigration rates, dimensions
            are T0 x E+S
        infer_pop (bool): =True if want to infer the population
        pop_dist (array_like): user provided population distribution,
            dimensions are T0+1 x E+S
        pre_pop_dist (array_like): user provided population distribution
            for the year before the initial year for calibration,
            length E+S
        country_id (str): country id for UN data
        initial_data_year (int): initial year of data to use
            (not relevant if have user provided data)
        final_data_year (int): final year of data to use,
            T0=initial_year-final_year + 1
        pop_dist (array_like): user provided population distribution, last
            dimension is of length E+S
        GraphDiag (bool): =True if want graphical output and printed
                diagnostics

    Returns:
        pop_dict (dict): includes:
            omega_path_S (Numpy array), time path of the population
                distribution from the current state to the steady-state,
                size T+S x S
            g_n_SS (scalar): steady-state population growth rate
            omega_SS (Numpy array): normalized steady-state population
                distribution, length S
            surv_rates (Numpy array): survival rates that correspond to
                each model period of life, length S
            mort_rates (Numpy array): mortality rates that correspond to
                each model period of life, length S
            g_n_path (Numpy array): population growth rates over the time
                path, length T + S

    """
    # TODO: this function does not generalize with T.
    # It assumes one model period is equal to one calendar year in the
    # time dimesion (it does adjust for S, however)
    T0 = (
        final_data_year - initial_data_year + 1
    )  # number of periods until constant fertility and mortality rates
    print(
        "Demographics data: Initial Data year = ",
        initial_data_year,
        ", Final Data year = ",
        final_data_year,
    )
    assert E + S <= max_age - min_age + 1
    assert initial_data_year >= 2011 and initial_data_year <= 2100 - 1
    assert final_data_year >= 2011 and final_data_year <= 2100 - 1
    # Ensure that the last year of data used is before SS transition assumed
    # Really, it will need to be well before this
    assert final_data_year > initial_data_year
    assert final_data_year < initial_data_year + T
    assert (
        T > 2 * T0
    )  # ensure time path 2x as long as allows rates to fluctuate
    if imm_rates is not None and pop_dist is None:
        assert (
            infer_pop is True
        )  # if pass immigration rates, need to infer population

    # Get fertility rates if not provided
    if fert_rates is None:
        # get fert rates from UN data from initial year to data year
        fert_rates = get_fert(
            E + S,
            min_age,
            max_age,
            country_id,
            initial_data_year,
            final_data_year,
            download_path=download_path,
        )
    else:
        # ensure that user provided fert_rates are of the correct shape
        assert fert_rates.shape[0] == T0
        assert fert_rates.shape[-1] == E + S
    # Extrapolate fertility rates for the rest of the transition path
    # the implicit assumption is that they are constant after the
    # last year of UN or user provided data
    fert_rates = np.concatenate(
        (
            fert_rates,
            np.tile(
                fert_rates[-1, :].reshape(1, E + S),
                (T + S - fert_rates.shape[0], 1),
            ),
        ),
        axis=0,
    )
    # Get mortality rates if not provided
    if mort_rates is None:
        # get mort rates from UN data from initial year to data year
        mort_rates, infmort_rates = get_mort(
            E + S,
            min_age,
            max_age,
            country_id,
            initial_data_year,
            final_data_year,
            download_path=download_path,
        )
    else:
        # ensure that user provided mort_rates are of the correct shape
        assert mort_rates.shape[0] == T0
        assert mort_rates.shape[-1] == E + S
        assert infmort_rates is not None
        assert infmort_rates.shape[0] == mort_rates.shape[0]
    # Extrapolate mortality rates for the rest of the transition path
    # the implicit assumption is that they are constant after the
    # last year of UN or user provided data
    mort_rates = np.concatenate(
        (
            mort_rates,
            np.tile(
                mort_rates[-1, :].reshape(1, E + S),
                (T + S - mort_rates.shape[0], 1),
            ),
        ),
        axis=0,
    )
    infmort_rates = np.concatenate(
        (
            infmort_rates,
            np.tile(infmort_rates[-1], (T + S - infmort_rates.shape[0])),
        )
    )
    mort_rates_S = mort_rates[:, E:]
    # Get population distribution if not provided
    # or if just provide initial pop and infer_pop=True
    if (pop_dist is None) or (pop_dist is not None and infer_pop is True):
        if infer_pop:
            if pop_dist is not None:
                initial_pop = pop_dist[0, :].reshape(1, pop_dist.shape[-1])
            else:
                initial_pop = None
            pop_2D, pre_pop = get_pop(
                E,
                S,
                min_age,
                max_age,
                infer_pop,
                fert_rates,
                mort_rates,
                infmort_rates,
                imm_rates,
                initial_pop,
                pre_pop_dist,
                country_id,
                initial_data_year,
                final_data_year,
                download_path=download_path,
            )
        else:
            pop_2D, pre_pop = get_pop(
                E,
                S,
                min_age,
                max_age,
                country_id=country_id,
                start_year=initial_data_year,
                end_year=final_data_year,
                download_path=download_path,
            )
    else:
        # Check first dims of pop_dist as input by user
        print("T0 = ", T0)
        assert pop_dist.shape[0] == T0 + 1  # population needs to be
        # one year longer in order to find immigration rates
        assert pop_dist.shape[-1] == E + S
        # Check that pre_pop specified
        assert pre_pop_dist is not None
        assert pre_pop_dist.shape[0] == pop_dist.shape[1]
        pre_pop = pre_pop_dist
        # Create 2D array of population distribution
        pop_2D = np.zeros((T0 + 1, E + S))
        for t in range(T0 + 1):
            pop_EpS = pop_rebin(pop_dist[t, :], E + S)
            pop_2D[t, :] = pop_EpS
    # Get percentage distribution for S periods for pre-TP period
    pre_pop_EpS = pop_rebin(pre_pop, E + S)
    # Get immigration rates if not provided
    if imm_rates is None:
        imm_rates_orig = get_imm_rates(
            E + S,
            min_age,
            max_age,
            fert_rates,
            mort_rates,
            infmort_rates,
            pop_2D,
            country_id,
            initial_data_year,
            final_data_year,
            download_path=download_path,
        )
    else:
        # ensure that user provided imm_rates are of the correct shape
        assert imm_rates.shape[0] == T0
        assert imm_rates.shape[-1] == E + S
        imm_rates_orig = imm_rates
    # Extrapolate immigration rates for the rest of the transition path
    # the implicit assumption is that they are constant after the
    # last year of UN or user provided data
    imm_rates_orig = np.concatenate(
        (
            imm_rates_orig,
            np.tile(
                imm_rates_orig[-1, :].reshape(1, E + S),
                (T + S - imm_rates_orig.shape[0], 1),
            ),
        ),
        axis=0,
    )
    # If the population distribution was given, check it for consistency
    # with the fertility, mortality, and immigration rates
    # if pop_dist is not None and not infer_pop:
    # len_pop_dist = pop_dist.shape[0]
    # pop_counter_2D = np.zeros((len_pop_dist, E + S))
    len_pop_dist = pop_2D.shape[0]
    pop_counter_2D = np.zeros((len_pop_dist, E + S))
    # set initial population distribution in the counterfactual to
    # the first year of the user provided distribution
    # pop_counter_2D[0, :] = pop_dist[0, :]
    pop_counter_2D[0, :] = pop_2D[0, :]
    for t in range(1, len_pop_dist):
        # find newborns next period
        # newborns = np.dot(fert_rates[t - 1, :], pop_counter_2D[t - 1, :])

        # pop_counter_2D[t, 0] = (
        #     1 - infmort_rates[t - 1]
        # ) * newborns + imm_rates[t - 1, 0] * pop_counter_2D[t - 1, 0]
        # pop_counter_2D[t, 1:] = (
        #     pop_counter_2D[t - 1, :-1] * (1 - mort_rates[t - 1, :-1])
        #     + pop_counter_2D[t - 1, 1:] * imm_rates_orig[t - 1, 1:]
        # )
        newborns = np.dot(fert_rates[t - 1, :], pop_counter_2D[t - 1, :])

        pop_counter_2D[t, 0] = (
            1 - infmort_rates[t - 1]
        ) * newborns + imm_rates_orig[t - 1, 0] * pop_counter_2D[t - 1, 0]
        pop_counter_2D[t, 1:] = (
            pop_counter_2D[t - 1, :-1] * (1 - mort_rates[t - 1, :-1])
            + pop_counter_2D[t - 1, 1:] * imm_rates_orig[t - 1, 1:]
        )
    # Check that counterfactual pop dist is close to pop dist given
    # assert np.allclose(pop_counter_2D, pop_dist)
    assert np.allclose(pop_counter_2D, pop_2D)

    """"
    CHANGE - in OG-Core, we are implicitLy assuming pre-TP rates of mortality,
    fertility, and immigration are the same as the period 0 rates.

    So let's just infer the pre-pop_dist from those.
    """
    pop1 = pop_2D[0, :]
    fert0 = fert_rates[0, :]
    mort0 = mort_rates[0, :]
    infmort0 = infmort_rates[0]
    imm0 = imm_rates_orig[0, :]
    pre_pop_guess = pop1.copy()

    # I can't solve this analytically, so set up a system of equation
    # to solve
    def pre_pop_solve(pre_pop_guess, pop1, fert0, mort0, infmort0, imm0):
        pre_pop = pre_pop_guess
        errors = np.zeros(E + S)
        errors[0] = pop1[0] - (
            (1 - infmort0) * (fert0 * pre_pop).sum() + imm0[0] * pre_pop[0]
        )
        errors[1:] = pop1[1:] - (
            pre_pop[:-1] * (1 - mort0[:-1]) + pre_pop[1:] * imm0[1:]
        )
        # print("Max error = ", np.abs(errors).max())
        return errors

    opt_res = opt.root(
        pre_pop_solve,
        pre_pop_guess,
        args=(pop1, fert0, mort0, infmort0, imm0),
        method="lm",
    )
    pre_pop = opt_res.x
    print(
        "Success? ",
        opt_res.success,
        ", Max diff = ",
        np.abs(opt_res.fun).max(),
    )
    pre_pop_EpS = pop_rebin(pre_pop, E + S)

    # Check result
    initial_pop_counter = np.zeros(E + S)
    newborns = (fert_rates[0, :] * pre_pop[:]).sum()
    initial_pop_counter[0] = (
        1 - infmort_rates[0]
    ) * newborns + imm_rates_orig[0, 0] * pre_pop[0]
    initial_pop_counter[1:] = (
        pre_pop[:-1] * (1 - mort_rates[0, :-1])
        + pre_pop[1:] * imm_rates_orig[0, 1:]
    )
    # Test that using pre pop get to pop in period 1
    print("Max diff = ", np.abs(pop_2D[0, :] - initial_pop_counter).max())
    # assert np.allclose(initial_pop_counter, pop_2D[0, :])

    # Create the transition matrix for the population distribution
    # from T0 going forward (i.e., past when we have data on forecasts)
    OMEGA_orig = np.zeros((E + S, E + S))
    OMEGA_orig[0, :] = (1 - infmort_rates[-1]) * fert_rates[-1, :] + np.hstack(
        (imm_rates_orig[-1, 0], np.zeros(E + S - 1))
    )
    OMEGA_orig[1:, :-1] += np.diag(1 - mort_rates[-1, :-1])
    OMEGA_orig[1:, 1:] += np.diag(imm_rates_orig[-1, 1:])

    # Solve for steady-state population growth rate and steady-state
    # population distribution by age using eigenvalue and eigenvector
    # decomposition
    eigvalues, eigvectors = np.linalg.eig(OMEGA_orig)
    g_n_SS = (eigvalues[np.isreal(eigvalues)].real).max() - 1
    eigvec_raw = eigvectors[
        :, (eigvalues[np.isreal(eigvalues)].real).argmax()
    ].real
    omega_SS_orig = eigvec_raw / eigvec_raw.sum()

    # Generate time path of the population distribution after final
    # year of data
    omega_path_lev = np.zeros((T + S, E + S))
    pop_curr = pop_2D[T0 - 1, :]
    omega_path_lev[:T0, :] = pop_2D[:T0, :]
    for per in range(T0, T + S):
        pop_next = np.dot(OMEGA_orig, pop_curr)
        omega_path_lev[per, :] = pop_next.copy()
        pop_curr = pop_next.copy()

    # Force the population distribution after 1.5*S periods to be the
    # steady-state distribution by adjusting immigration rates, holding
    # constant mortality, fertility, and SS growth rates
    imm_tol = 1e-14
    fixper = int(1.5 * S + T0)
    omega_SSfx = omega_path_lev[fixper, :] / omega_path_lev[fixper, :].sum()
    imm_objs = (
        fert_rates[fixper, :],
        mort_rates[fixper, :],
        infmort_rates[fixper],
        omega_path_lev[fixper, :],
        g_n_SS,
    )
    imm_fulloutput = opt.fsolve(
        immsolve,
        imm_rates_orig[fixper, :],
        args=(imm_objs),
        full_output=True,
        xtol=imm_tol,
    )
    imm_rates_adj = imm_fulloutput[0]
    imm_diagdict = imm_fulloutput[1]
    omega_path_S = omega_path_lev[:, -S:] / (
        omega_path_lev[:, -S:].sum(axis=1).reshape((T + S, 1))
    )
    omega_path_S[fixper:, :] = np.tile(
        omega_path_S[fixper, :].reshape((1, S)), (T + S - fixper, 1)
    )
    g_n_path = np.zeros(T + S)
    g_n_path[1:] = (
        omega_path_lev[1:, -S:].sum(axis=1)
        - omega_path_lev[:-1, -S:].sum(axis=1)
    ) / omega_path_lev[:-1, -S:].sum(axis=1)
    g_n_path[0] = (
        omega_path_lev[0, -S:].sum() - pre_pop_EpS[-S:].sum()
    ) / pre_pop_EpS[-S:].sum()
    g_n_path[fixper + 1 :] = g_n_SS
    omega_S_preTP = pre_pop_EpS[-S:] / pre_pop_EpS[-S:].sum()
    imm_rates_mat = np.concatenate(
        (
            imm_rates_orig[:fixper, E:],
            np.tile(
                imm_rates_adj[E:].reshape(1, S),
                (T + S - fixper, 1),
            ),
        ),
        axis=0,
    )

    if GraphDiag:
        # Check whether original SS population distribution is close to
        # the period-T population distribution
        omegaSSmaxdif = np.absolute(
            omega_SS_orig - (omega_path_lev[T, :] / omega_path_lev[T, :].sum())
        ).max()
        if omegaSSmaxdif > 0.0003:
            print(
                "POP. WARNING: Max. abs. dist. between original SS "
                + "pop. dist'n and period-T pop. dist'n is greater than"
                + " 0.0003. It is "
                + str(omegaSSmaxdif)
                + "."
            )
        else:
            print(
                "POP. SUCCESS: orig. SS pop. dist is very close to "
                + "period-T pop. dist'n. The maximum absolute "
                + "difference is "
                + str(omegaSSmaxdif)
                + "."
            )

        # Plot the adjusted steady-state population distribution versus
        # the original population distribution. The difference should be
        # small
        omegaSSvTmaxdiff = np.absolute(omega_SS_orig - omega_SSfx).max()
        if omegaSSvTmaxdiff > 0.0003:
            print(
                "POP. WARNING: The maximum absolute difference "
                + "between any two corresponding points in the original"
                + " and adjusted steady-state population "
                + "distributions is"
                + str(omegaSSvTmaxdiff)
                + ", "
                + "which is greater than 0.0003."
            )
        else:
            print(
                "POP. SUCCESS: The maximum absolute difference "
                + "between any two corresponding points in the original"
                + " and adjusted steady-state population "
                + "distributions is "
                + str(omegaSSvTmaxdiff)
            )

        # Print whether or not the adjusted immigration rates solved the
        # zero condition
        immtol_solved = np.absolute(imm_diagdict["fvec"].max()) < imm_tol
        if immtol_solved:
            print(
                "POP. SUCCESS: Adjusted immigration rates solved "
                + "with maximum absolute error of "
                + str(np.absolute(imm_diagdict["fvec"].max()))
                + ", which is less than the tolerance of "
                + str(imm_tol)
            )
        else:
            print(
                "POP. WARNING: Adjusted immigration rates did not "
                + "solve. Maximum absolute error of "
                + str(np.absolute(imm_diagdict["fvec"].max()))
                + " is greater than the tolerance of "
                + str(imm_tol)
            )

        # Test whether the steady-state growth rates implied by the
        # adjusted OMEGA matrix equals the steady-state growth rate of
        # the original OMEGA matrix
        OMEGA2 = np.zeros((E + S, E + S))
        OMEGA2[0, :] = (1 - infmort_rates[-1]) * fert_rates[-1, :] + np.hstack(
            (imm_rates_adj[0], np.zeros(E + S - 1))
        )
        OMEGA2[1:, :-1] += np.diag(1 - mort_rates[-1, :-1])
        OMEGA2[1:, 1:] += np.diag(imm_rates_adj[1:])
        eigvalues2, eigvectors2 = np.linalg.eig(OMEGA2)
        g_n_SS_adj = (eigvalues[np.isreal(eigvalues2)].real).max() - 1
        if np.max(np.absolute(g_n_SS_adj - g_n_SS)) > 10 ** (-8):
            print(
                "FAILURE: The steady-state population growth rate"
                + " from adjusted OMEGA is different (diff is "
                + str(g_n_SS_adj - g_n_SS)
                + ") than the steady-"
                + "state population growth rate from the original"
                + " OMEGA."
            )
        elif np.max(np.absolute(g_n_SS_adj - g_n_SS)) <= 10 ** (-8):
            print(
                "SUCCESS: The steady-state population growth rate"
                + " from adjusted OMEGA is close to (diff is "
                + str(g_n_SS_adj - g_n_SS)
                + ") the steady-"
                + "state population growth rate from the original"
                + " OMEGA."
            )

        # Do another test of the adjusted immigration rates. Create the
        # new OMEGA matrix implied by the new immigration rates. Plug in
        # the adjusted steady-state population distribution. Hit is with
        # the new OMEGA transition matrix and it should return the new
        # steady-state population distribution
        omega_new = np.dot(OMEGA2, omega_SSfx)
        omega_errs = np.absolute(omega_new - omega_SSfx)
        print(
            "The maximum absolute difference between the adjusted "
            + "steady-state population distribution and the "
            + "distribution generated by hitting the adjusted OMEGA "
            + "transition matrix is "
            + str(omega_errs.max())
        )

        # Plot the original immigration rates versus the adjusted
        # immigration rates
        immratesmaxdiff = np.absolute(imm_rates_orig - imm_rates_adj).max()
        print(
            "The maximum absolute distance between any two points "
            + "of the original immigration rates and adjusted "
            + "immigration rates is "
            + str(immratesmaxdiff)
        )

        # plots
        age_per_EpS = np.arange(1, E + S + 1)
        pp.plot_omega_fixed(
            age_per_EpS, omega_SS_orig, omega_SSfx, E, S, path=OUTPUT_DIR
        )
        pp.plot_imm_fixed(
            age_per_EpS,
            imm_rates_orig[fixper - 1, :],
            imm_rates_adj,
            E,
            S,
            path=OUTPUT_DIR,
        )
        pp.plot_population_path(
            age_per_EpS,
            omega_path_lev,
            omega_SSfx,
            initial_data_year,
            initial_data_year,
            initial_data_year,
            S,
            path=OUTPUT_DIR,
        )

    # Return objects in a dictionary
    pop_dict = {
        "omega": omega_path_S,
        "g_n_ss": g_n_SS,
        "omega_SS": omega_SSfx[-S:] / omega_SSfx[-S:].sum(),
        "rho": mort_rates_S,
        "g_n": g_n_path,
        "imm_rates": imm_rates_mat,
        "omega_S_preTP": omega_S_preTP,
    }

    return pop_dict