/* Please do not attempt to run this program without reading the accompanying
   documentation. */

version 8.0
set more off
prog drop _all
capture log close
clear
global ver = "2.2.0"

/*
File:   master.do
Date:   2012
Desc:   Master do-file for creating CEPR consistent extracts of SIPP.
Note:   See copyright notice at the end of this program.
*/

/* User-set initial macros */

/* This program is designed to clean multiple panels of the SIPP data and
   can be adapted to generate final data files by month or by wave.
   The initial macro set ups define the structure of the final files, as
   well as set up directories, etc. Each of these initial macros must be
   evaluated by the user -- you -- to determine what values serve your
   purposes and to define your directories. */

/*Panel and Year Macros*/

* Panel: 08, 04, 01, 96, 93, 92, 91, 90
global p = "08"
/* Local copy of the panel code; it is expanded as `p' when the raw-data
   file name stubs (f, c, t) are built below. */
local p = "$p"

// Set D topcoding requires these macros, the beginning and end year of the panel
// (they must be changed together with the panel macro p above)
global y1 = 2008
global y2 = 2011

/* These must be the directories that your data is stored in, where the
   local macro `p' indicates the two-digit panel year set above, l stands
   for Longitudinal files, w stands for Core files, and t stands for
   Topical Module files. Do not change the macro names, only change the
   directory structure if you choose not use these names. */

/*Windows vs. GNU/Linux*/
global gnulin = 1 /*Set gnulin=0 if you run Windows; 1 if GNU/Linux*/

/* Stop early with a clear message on an invalid setting: neither branch
   below would run, every directory global would stay undefined, and the
   failure would only surface later at the first cd/do. */
if $gnulin != 0 & $gnulin != 1 {
    display as error "gnulin must be 0 (Windows) or 1 (GNU/Linux)"
    exit 198
}

if $gnulin==1 {

    /*root directory*/
    global rootdir "/home/ben/data/sipp/cepr/"

    /*data directories*/
    global rawdata "$rootdir/raw"
    global tempdata "/tmp" /*temporary data directory*/
    global output "$rootdir/sipp$p"
    global outputq "$rootdir/setdata/sipp" /*needed for 92/93 panels, Set E*/

    /*extracted raw data filenames*/
    global f "$rawdata/sipp`p'l"
    global c "$rawdata/sipp`p'w"
    global t "$rawdata/sipp`p't"

    /*program and log directories*/
    global programs "$rootdir/do/recode"
    global log "$programs"
    global codebooks "$programs"
}

if $gnulin==0 {

    /*root directory*/
    global rootdir "f:\_files\ceprdata\sipp"

    /*data directories*/
    global rawdata "$rootdir\rawdata"
    global tempdata "$rootdir\tempdata"
    global output "$rootdir\setdata\sipp$p"
    global outputq "$rootdir\setdata\sipp" /*needed for 92/93 panels, Set E*/

    /*extracted raw data filenames*/
    global f "$rawdata\sipp`p'l"
    global c "$rawdata\sipp`p'w"
    global t "$rawdata\sipp`p't"

    /*program & log directories*/
    global programs "$rootdir\programs\recode"
    global log "$rootdir\programs\recode"
    global codebooks "$rootdir\codebooks\"
}

* Time frame (to facilitate ease of programming)
global tf = "wave srefmon"

* Identifying/merging variables
/* Note that the person identifier (id) is unique, but to merge data from
   the separate sub-files, you need to match not only the person, but the
   month (here as the reference month, srefmon, rather than calendar month)
   and the wave. Thus, these variables must be in every subfile and are
   sorted on in order to facilitate merging later on. */
global ids = "id wave srefmon"
global idw = "id wave rot"
global idr = "id rot" /*for TM-only files*/

/* Time frame program */

/* wv: reduce the data in memory from person-month to person-wave.

   For every id-wave group the program finds the largest srefmon present,
   flags that observation with the new 0/1 variable seam ("last month of
   wave"), and keeps only the flagged observations -- i.e. it drops the
   earlier months of each wave (the first three when all four reference
   months are present). It then tabulates the mean of seam and srefmon by
   wave as a visual check.

   Requires id, wave and srefmon in memory; creates seam, so it exits with
   an error if seam (or the scratch variable seam_) already exists.

   To help satisfy memory constraints, one could run wv after each pull
   (before each clean). */
capture program drop wv
prog define wv
    sort id wave srefmon
    /* highest reference month observed within each person-wave */
    egen seam_ = max(srefmon), by(id wave)
    sort id wave
    quietly by id wave: gen byte seam = seam_==srefmon
    label var seam "last month of wave"
    drop seam_
    keep if seam==1
    table wave, c(mean seam mean srefmon)
end

label define yesno 1 "yes" 0 "no"

/*Set Generator*/

/* Each set is built by a pull_* do-file followed by a clean_* do-file.
   The working directory is reset to the programs directory before every
   step; the path is quoted so that directories containing spaces work. */

/*SET A - ID & WEIGHTS*/
cd "$programs"
do pull_a_idweights
cd "$programs"
do clean_a_idweights

/*SET B - DEMOGRAPHICS*/
cd "$programs"
do pull_b_demographics
cd "$programs"
do clean_b_demographics

/*SET C - HOUSEHOLD & FAMILY*/
cd "$programs"
do pull_c_hhfam
cd "$programs"
do clean_c_hhfam

/*SET D - EMPLOYMENT*/
cd "$programs"
do pull_d_employment.do
cd "$programs"
do clean_d_employment.do

/*SET E - CHILDCARE*/
cd "$programs"
do pull_e_childcare
cd "$programs"
do clean_e_childcare

/*SET F - INCOME*/
cd "$programs"
do pull_f_income
cd "$programs"
do clean_f_income

/*SET G - INCOME TRANSFERS*/
cd "$programs"
do pull_g_incometransfers
cd "$programs"
do clean_g_incometransfers

/*SET H - HEALTH INSURANCE*/
cd "$programs"
do pull_h_healthins
cd "$programs"
do clean_h_healthins

/*SET I - WORK SCHEDULES*/
cd "$programs"
do pull_i_workschedules
cd "$programs"
do clean_i_workschedules

/*SET J - LEAVE*/
cd "$programs"
do pull_j_leave
cd "$programs"
do clean_j_leave

/*
/*Create Codebooks*/
cd "$programs"
do codebooks.do
*/

/* Release notes
2.2.0 May 2, 2012
1. Corrected occupation label typos in 2008 panel.
2.1.9 March 1, 2012
1. Added 2008 panel
2.1.7 Sep 14, 2011
1. Corrected primjob variable in Set D of 2004 panel.
2.1.6 Aug 3, 2011
1. Added longitudinal weights and variance-calculating variables.
2.1.5 Feb 13, 2011
1. Amended Census coding error in id variables of Wave 7 of 2004 panel.
2.1.4 Feb 11, 2011
1. Corrected hours variables in Set D of 2004 panel -- missing values were erroneously being mapped to zero hours.
2.1.3 Oct 25, 2010
1.
Set H for 2001 panel incorrectly contained dependent observations (as Set H2 is supposed to). Corrected this. 2. Eliminated 500-something observations in 2001 panel containing missing ids, due to observations in the longitudinal weight file not in the core data. 2.1.2 Aug 31, 2010 1. Corrected hispanic variable in 2004 to make consistent with previous panels. 2. Added ethnic variable to 2004 panel. 3. Used Stata 10's saveold to ensure data compatibility with previous versions of Stata. 2.1.1 Aug 24, 2010 1. Extracts updated to include latest 2001 and 2004 panel data from Census 2. Corrected "yr" variable in Set E of the 2004 panel 2.1 March 31, 2010 1. 2004 data added */ /* USER NOTES This program creates thematic datasets from the SIPP with variables that are, to the extent possible, consistent across panels. These datasets can be merged back together for analysis. These variables have been validated using outside sources and this analysis is available from CEPR on request (the contact address is missing from this copy of the file). The thematic datasets generated by this program are all in "long" format or "person-month" format. Thus, these programs convert the SIPP longitudinal files from "wide" format into "long" format in order to facilitate ease of programming and merging with the Core and Topical Module data. Each of these thematic datasets can be merged back together using the following variables: id wave srefmon. To facilitate this, each thematic dataset is sorted on these variables before saving. This program assumes that the user has extracted the SIPP data into Stata using the CEPR Extraction Programs. Please see the Crosswalk file for information on the raw SIPP variables used in these programs. Please see the Final Variables file and the Codebook for each panel for information on the final variables created by this program. Each file also generates a Codebook, which should match the one that accompanies this program. This way, you can ensure that your SIPP data matches the CEPR SIPP data. 
In order to run this program, the user need only: (1) Identify the panel year that they wish to clean (above under "User-set initial macros"). (2) Indicate your directory structure (above under "User-set initial macros"). It is highly recommended that you use the following directory structure: Save all *.do files in the following directory: f:\_files\ceprdata\sipp\programs\clean\ Save all *.dta files in the following directory: f:\_files\ceprdata\sipp\data\sipp01\ Where "01" indicates the panel year and should change for each panel to the last two digits of the panel year. (3) Indicate whether or not you wish to add code to reduce the overall size of the data. Please note that you must change the panel macro "p" (and the matching year macros y1 and y2) and re-run this program for each panel to be cleaned. To reduce the memory needs for running the SIPP the user could: (1) Add a line of code to run the wv program after each of the pull programs. This will keep only one month per wave of data, thus reducing the size of the dataset down to one-fourth of its total size. Please see the Crosswalk for information on which variables are from questions that are asked monthly or once per wave. (2) If you are only looking at adults, add a line of code to delete those under age 18 after each program except Set C, which requires children. To facilitate this, each thematic dataset includes [variable name missing from this copy of the file; presumably an age variable -- confirm against the Final Variables file]. Please direct all questions and comments about this program to CEPR (the contact address is missing from this copy of the file). */ /* Acknowledgements I am grateful to Jean Roth for assistance with the data and to Jean Roth and others at the NBER for their efforts to make the SIPP data widely available. Many thanks also to Helen Connolly, John Schmitt, Jeffrey Sisson, Jeffrey Wenger, and the SIPP Outreach Staff at the U.S. Census for helpful discussions about the SIPP data. Bradley Hardy provided valuable research assistance. The construction of this SIPP extract has been funded by a generous grant from the Rockefeller Foundation. 
The underlying Survey of Income and Program Participation data referenced here are in the public domain. The programs are distributed under the GNU GPL. See end of this file and http://www.gnu.org/licenses/ for details. */