*******************************************************************

*** Workshop: Introduction to the analysis of large-scale data on social connections

***  Cambridge, 12 September 2012


***  Paul Lambert and Dave Griffiths, University of Stirling

*** Workshop organised by the  UK ESRC project 'Social Networks and Occupational Structure',
***   RES-062-23-2497, www.camsis.stir.ac.uk/sonocs

*******************************************************************



*******************************************************************
*******************************************************************
*** LAB 1: INTRODUCTION TO THE ANALYSIS OF SOCIAL CONNECTIONS DATA (STATA EXAMPLES) 

*******************************************************************
*******************************************************************


** 1) REVIEW - EXAMPLES OF RELEVANT DATA 
** 2) SID ANALYSIS OF THE UK's BHPS 
** 3) ANALYSIS EXAMPLE: SOCIAL INTERACTION DISTANCE ANALYSIS OF THE FRENCH 1962 DATA 
** 4) ANALYSIS EXAMPLE: ALTERNATIVE INDIVIDUAL LEVEL MODELS APPLIED TO THE BHPS DATA 





*******************************************************************
*******************************************************************



* Preliminaries: folder locations, helper programmes and session set-up

* Scratch folder (needs write permission); temporary datasets and graphs go here
global path9 "D:\temp\"

* Folder holding the supplied BHPS extract file
global bhps_derived "D:\sonocs2012\data\bhps\"

* France 1962 IPUMS-I census: source data, occupation label syntax, derived files
global fr62_source "D:\sonocs2012\data\france1962\source\"
global france_occ_labels "D:\sonocs2012\metadata\france_occ_labels.do"
global france_derived "D:\sonocs2012\data\france1962\derived\"

* Run the helper file that defines some simple programmes for summarising
*  results from random effects models. An online copy is at:
*do http://www.staff.stir.ac.uk/paul.lambert/essex_summer_school/2011/sub_files/variance_summaries.do
do d:\sonocs2012\syntax\macros\variance_summaries.do

* Empty the data and matrices in memory, then request working memory
clear
capture clear matrix
set mem 150m

** {Other installations that may be required are noted at the end of the file: tabplot}


*******************************************************************
*******************************************************************




** 1) REVIEW - EXAMPLES OF DATA ON SOCIAL CONNECTIONS




****************************************************



** 1.1) France, 1962, Household census data example IPUMS data 

** Open the original census-format file (a subsample of those aged 20+)
use $fr62_source\france_1962_raw.dta, clear

** Put persons in order within households and look at the first 60 records
sort serial pernum
list serial pernum age sex edattan occ in 1/60


** Educational homogamy: how education categories are spread within households
xttab edattan, i(serial)

* Reading the output, e.g.:
*   50.5% of hhlds have at least one person with edattan = 'primary'
*   70.9% of hhlds with at least one at primary have everyone at primary


** Household mean of education, and each person's difference from that mean
egen meaneduc = mean(edattan), by(serial)
generate difeduc = edattan - meaneduc

** Horizontal bars of both measures, averaged within occupation categories
graph bar (mean) edattan difeduc, ///
   over(fr62a_occ, label(labsize(vsmall))) ///
   horizontal ///
   bar(1, bcolor(gs11)) bar(2, bcolor(gs8)) ///
   scheme(s1mono) ///
   subtitle("France 1962, education and occupation") ///
   legend(order(1 2) label(1 "Mean educ") label(2 "Within household difference"), span)
* I.e., 'professors' have high education, but are relatively less different in educ level to
*    other members of their household than some other professional jobs.




** Example: data held as one record per husband-wife (hw) combination
*   (the processing that builds this file is in lab2.do)

use $france_derived\fr62_hw_micro2.dta, clear
summarize

* Strip numeric prefixes from value labels, then list the first 20 couples
sort serial
numlabel _all, remove
list serial hocc wocc age age_sp in 1/20


* One-way distributions, then a plot of the husband-by-wife occupation table
*  (a depiction of the inter-relation between the two responses)
tab1 hocc wocc
tabplot hocc wocc, ///
   scheme(s1mono) height(2) ///
   xlabel(, labsize(tiny) angle(60)) ylabel(, labsize(vsmall)) ///
   xtitle("") ytitle("")



** The data are typically reduced further into a 'table format' dataset:
**  one record per hocc/wocc cell, with the cell count held in freq

* Before collapsing: one record per couple
summarize hocc wocc
tabulate hocc

generate freq = 1
collapse (sum) freq, by(hocc wocc)

* After collapsing: unweighted results describe cells, weighted results describe couples
summarize hocc wocc
tabulate hocc
summarize hocc wocc [fweight=freq]
tabulate hocc [fweight=freq]
list in 1/10

* The 20 most frequent husband/wife occupation combinations
gsort -freq
list in 1/20


**********

** Construction of a pairs dataset as shown in the slides (talk1):
**  every male is joined to every female in the same household

* Step 1: female records, saved to the temp folder as the 'wife' side
use $fr62_source\france_1962_raw.dta, clear
sort serial pernum
list serial pernum age sex edattan occ in 1/60
keep if sex==2
keep serial occ age
rename occ wocc
rename age age_sp
sort serial
* N = 815k females
summarize
save $path9\temp.dta, replace

* Step 2: male records, kept in memory as the 'husband' side
use $fr62_source\france_1962_raw.dta, clear
keep if sex==1
keep serial occ age
rename occ hocc
sort serial
* N = 734k males
summarize

* Step 3: form all within-household male-female combinations
joinby serial using $path9\temp.dta
* N = 876k pairs of male-female within household combinations
summarize



****************************************************
****************************************************





** 1.2) BHPS household data over time  


** An extract from the UK's BHPS, not for use outside the workshop 
*  (source data available from esds.ac.uk; construction syntax at: 
*      www.camsis.stir.ac.uk/sonocs/workshops/wog/ )
* (The URL line above must start with an asterisk: without it Stata treats the
*   line as a command and the do-file stops with an error.)
**


use $bhps_derived\bhps_example.dta, clear
codebook, compact

* 'hid' indicates the same household in a specific year
* 'pid' indicates a unique person (the same person may contribute a record on more than one year) 

* Household order: list a selection of cases from 1991, then from 1992
sort hid pid year 
list hid pid year sex age ghq mcamsis in 1/30 /* Selection of cases from 1991 */
list hid pid year sex age ghq mcamsis in 15000/15050 /* Selection of cases from 1992 */

* Person order: records for the same pid in different years sit next to each other
sort pid year hid 
list hid pid year sex age ghq mcamsis in 55/100 /* Some linked cases potentially spanning two different years of cases from 1991 */


* Back to household order: own job alongside the jobs of three kinds of alter
sort hid pid year
list hid pid year sex age esec netesec spesec paesec in 15000/15002 
/* Some 1992 cases with own job, friends job, spouses job, and fathers job  */
/* Some 1992 cases with own job, friends job, spouses job, and fathers job  */


** Some examples of calculating data about other people (using Stata functions): 
** Both examples attach a household-level value to every record sharing the same hid.

* Highest household isei: 
* (the maximum of isei within each hid, copied to all records of that hid)
capture drop hhmax_isei
egen hhmax_isei=max(isei), by(hid) 
sort hid pid year 
list hid isei hhmax_isei in 100/120

* GHQ of the oldest member of the household
* Sort by household, and by descending age within household, so that the first
*  record of each hid is the one flagged as 'oldest' below.
* NOTE(review): if age has missing values these would presumably sort first under
*  -age and be treated as the oldest - confirm against the data.
gsort +hid -age
list hid age ghq in 151/170 
* Flag = 1 on the first record of each household, .m on all later records
capture drop oldest
gen oldest=1
replace oldest=.m if hid[_n-1]==hid
tab oldest
* ghq_old1 holds ghq for the flagged record only (missing elsewhere)
capture drop ghq_old1
gen ghq_old1=ghq*oldest
* Copy that single non-missing value to every record of the household
capture drop ghq_old2
egen ghq_old2=max(ghq_old1), by(hid)
sum ghq_old2
* Restore household/person/year order before listing
sort hid pid year 
list hid age ghq ghq_old2 in 100/120




* Background - a device to graph the household size distribution in 1991,
*  counting each household once

* Indicator for one case per hid/year combination
egen taghh = tag(hid year)
* i.e. 10730 different households in this data
tabulate taghh
tab1 hhsize ihsize if taghh==1 & year==1991

* Keep a copy of the person-level file, since collapse replaces the data in memory
save $path9\m5.dta, replace

* (a) number of 1991 households at each value of hhsize
use $path9\m5.dta, clear
collapse (sum) hfreq=taghh if taghh==1 & year==1991, by(hhsize)
rename hhsize hsize
sort hsize
save $path9\m6.dta, replace

* (b) number of 1991 households at each value of ihsize
use $path9\m5.dta, clear
collapse (sum) ifreq=taghh if taghh==1 & year==1991, by(ihsize)
rename ihsize hsize
sort hsize

* Put the two sets of counts side by side, matched on size
merge hsize using $path9\m6.dta
drop _merge

graph bar (mean) ifreq hfreq, ///
   over(hsize) ///
   bar(1, bcolor(gs12)) bar(2, bcolor(gs8)) ///
   scheme(s1mono) ///
   legend(order(1 2) label(1 "Interviews per household") label(2 "Residents per household"))


****************************************************
****************************************************







****************************************************
****************************************************
** 2) SOCIAL INTERACTION ANALYSIS INTRODUCTION: SID MODELS APPLIED TO THE BHPS DATA 



** Terminology used in this section:
**   Self  = the individual sampled
**   Alter = Friend (net* variables), Father (pa* variables) or Spouse (sp* variables)

use $bhps_derived\bhps_example.dta, clear
codebook, compact
tab1 esec educ4 speduc4 netesec paesec spesec if year==1992, missing
* Work with the 1992 records only, for simplicity
keep if year==1992
tabulate sex



** Association model for the occupation to education relationship:
table esec, c(mean educ4 n educ4)
ca esec educ4, dimensions(2)
cabiplot, title("Own job to own educ")
graph save $path9\bit1.gph, replace
* Dimension 1 seems to be from high to low education and from more to less advantaged occupation

* For later use, store the dimension 1 row scores.
*  (the predicted values are scores given to esec categories)
capture drop mod1sc
predict mod1sc, rowscore(1)
table esec, c(mean mod1sc sd mod1sc)




** Association model for own occupation to spouse's education:

* All cases in memory
table esec, c(mean educ4 n educ4 mean speduc4 n speduc4)
ca esec speduc4, dimensions(2)
cabiplot, title("Own job to spouse's educ")

* Cases with sex==1 only: husbands' occupations and their wives' education
table esec if sex==1, c(mean educ4 n educ4 mean speduc4 n speduc4)
ca esec speduc4 if sex==1, dimensions(2)
cabiplot, title("Husbands job to wife's educ")
graph save $path9\bit2.gph, replace

* Store the dimension 1 row scores from the sex==1 model
capture drop mod2sc
predict mod2sc, rowscore(1)


** Association model for own occupation to spouse's occupation:

* All cases in memory
tabplot esec spesec, ///
   scheme(s1mono) height(1) ///
   xlabel(, labsize(tiny) angle(60)) ylabel(, labsize(vsmall)) ///
   xtitle("") ytitle("")
ca esec spesec, dimensions(2)
cabiplot, title("Own job to spouse's job")

* Cases with sex==1 only
tabplot esec spesec if sex==1, ///
   scheme(s1mono) height(1) ///
   xlabel(, labsize(tiny) angle(60)) ylabel(, labsize(vsmall)) ///
   xtitle("") ytitle("")
ca esec spesec if sex==1, dimensions(2)
cabiplot, title("Husband's job to wife's job")


* Dim 1 is dominated by self-employment, so these cells are coded as a
*  pseudo-diagonal (psd): both partners in esec categories 4 or 5
capture drop psd1
generate psd1 = (inlist(esec, 4, 5) & inlist(spesec, 4, 5))
ca esec spesec if sex==1 & psd1==0, dimensions(2)
cabiplot, title("Husband's job to wife's job") subtitle("(Excluding self-employed diagonals)")
graph save $path9\bit3.gph, replace

* Store the dimension 1 row scores from the model excluding the pseudo-diagonal
capture drop mod3sc
predict mod3sc, rowscore(1)


** Association model for own occupation to father's occupation:
tabplot esec paesec, ///
   scheme(s1mono) height(1) ///
   xlabel(, labsize(tiny) angle(60)) ylabel(, labsize(vsmall)) ///
   xtitle("") ytitle("")
ca esec paesec, dimensions(2)
cabiplot, title("Own job to father's job")
graph save $path9\bit4.gph, replace

* Store the dimension 1 row scores
capture drop mod4sc
predict mod4sc, rowscore(1)




** Association model for own occupation to friend's occupation:
tabplot esec netesec, ///
   scheme(s1mono) height(1) ///
   xlabel(, labsize(tiny) angle(60)) ylabel(, labsize(vsmall)) ///
   xtitle("") ytitle("")
ca esec netesec, dimensions(2)
cabiplot, title("Own job to friend's job")
graph save $path9\bit5.gph, replace

* Store the dimension 1 row scores
capture drop mod5sc
predict mod5sc, rowscore(1)



** Association model: own isco883 to friend's isco883

* Derive the friend's code by dropping the last digit of netisco
capture drop netisco3
generate netisco3 = floor(netisco/10)
tab1 isco883 netisco3
* Set the negative codes to missing
recode isco883 (-1=.m)
recode netisco3 (-100=.m)

* First attempt: all cases
tabplot isco883 netisco3, ///
   scheme(s1mono) height(2) ///
   xlabel(, labsize(tiny) angle(60)) ylabel(, labsize(tiny)) ///
   xtitle("") ytitle("")
ca isco883 netisco3, dimensions(2)
cabiplot, title("Own job ISCO to friend's job ISCO")
* 212/242 are an apparent pseudo-diagonal; in general all 3-digit or 2-digit occs might be considered diagonal

* Second attempt: exclude pairs whose codes agree after dropping one further digit
capture drop psd2
generate psd2 = (floor(isco883/10) == floor(netisco3/10))
tabulate psd2
ca isco883 netisco3 if psd2==0, dimensions(2)
cabiplot, title("Own job ISCO to friend's job ISCO")
* This doesn't work - taking out too much data

* Third attempt: exclude either an exact occupation match (isco88 equal to netisco),
*  or any combination of codes 212 and 242
capture drop psd3
generate psd3 = (isco88==netisco) | ///
   (inlist(isco883, 212, 242) & inlist(netisco3, 212, 242))
tabulate psd3
ca isco883 netisco3 if psd3==0, dimensions(2)
cabiplot, title("Own job ISCO to friend's job ISCO")
* This seems ok - but there are sparse cells at present so analysis isn't ideal
graph save $path9\bit6.gph, replace

* Store the dimension 1 row scores from the third attempt
capture drop mod6sc
predict mod6sc, rowscore(1)


* Show the six saved biplots together. The last file is bit6.gph (the ISCO
*  own-to-friend model); listing bit5.gph twice would repeat the friend's-job
*  plot and leave the ISCO plot out.
graph combine $path9\bit1.gph $path9\bit2.gph $path9\bit3.gph ///
    $path9\bit4.gph $path9\bit5.gph $path9\bit6.gph  

* Pairwise correlations between the six sets of dimension 1 scores and two
*  existing occupational measures (mcamsis, isei), with N and significance
pwcorr mod1sc mod2sc mod3sc mod4sc mod5sc mod6sc mcamsis isei, obs sig

* Mean of scores 1 to 5 within each esec category
graph hbar (mean) mod1sc mod2sc mod3sc mod4sc mod5sc, ///
    over(esec, label(labsize(small))) scheme(s1mono) ///
   legend(order(1 2 3 4 5) label(1 "Own educ") label(2 "Wife's educ") label(3 "Wife's job") ///
    label(4 "Father's job") label(5 "Friend's job") cols(3) span) ///
   bar(1, bcolor(gs8)) bar(2, bcolor(gs12)) bar(3, bcolor(gs7)) bar(4, bcolor(gs13)) ///
   bar(5, bcolor(gs10)) 


** Conclusions: 

** These correspondence analysis models show dimensions of structure in the 
**  patterns of connections between the variables involved, which are either 
**   socio-economic connections (own job to own education) or social distance 
**   connections (own job to job of an alter). 
** The high correlations suggest that social distance patterns reflect socio-economic 
**   structure, and so the dimensions of social distance also serve to map 
**   dimensions of social structure. 







***** More examples from a BHPS data extract (as used in lecture slides)

* An extract of cohabiting couples with data on job and newspaper preferences
use $bhps_derived\newspapers_example.dta, clear

* Husband's newspaper (apaper1) by wife's newspaper (fpaper1)
tab1 apaper1 fpaper1
ca apaper1 fpaper1
cabiplot
tabulate apaper1 fpaper1
* beware - there are too many diagonals here - the analysis relies on off-diagonal patterns
* SID analysis more or less shows highbrow/lowbrow pattern, but there's something odd about guardian readers
* (Note - with many more cases, I've done examples of this analysis where the highbrow/lowbrow
*    dimension is clearly the main dimension of SID patterns of newspaper readership - PL)



* Husband's job (ajbsoc) by wife's job (fjbsoc); the biplot is drawn twice,
*  first with defaults and then in a titled monochrome version
tab1 ajbsoc fjbsoc
ca ajbsoc fjbsoc
cabiplot
cabiplot, nocol ///
   title("SID analysis of Husband-wife jobs") ///
   scheme(s1mono) note("")


** Some other relevant models include:

* Within the same people (males), relation between job and newspaper
ca apaper1 ajbsoc
cabiplot

* Within the same people (females), relation between job and newspaper
ca fpaper1 fjbsoc
cabiplot


* Relation between husband's paper and wife's job (default, then titled version)
ca apaper1 fjbsoc
cabiplot
cabiplot, nocol ///
   title("SID analysis of Husband newspaper-wife job") ///
   scheme(s1mono) note("")

* Relation between husband's job and wife's paper
ca fpaper1 ajbsoc
cabiplot


** Illustration: jobs scaled by income versus jobs scaled by interactions:

summarize ajbsoc afimn fjbsoc

* Interaction-based score: dimension 1 row score from the husband-wife jobs model
ca ajbsoc fjbsoc
capture drop ca_score
predict ca_score, rowscore(1)

* Income-based score: mean of afimn within each ajbsoc category,
*  after setting values from -9 to 0 to missing
summarize afimn
recode afimn (-9/0=.m)
capture drop inc_score
egen inc_score = mean(afimn), by(ajbsoc)
table ajbsoc, c(mean ca_score mean inc_score n ca_score n inc_score)

* Put both scores on the same scale: mean standardised, plus 2.
*  Each replace uses r(mean) and r(sd) left behind by the summarize just before it.
summarize inc_score if !missing(ajbsoc)
replace inc_score = 2 + ((inc_score - r(mean)) / r(sd))
summarize ca_score if !missing(ajbsoc)
replace ca_score = 2 + ((ca_score - r(mean)) / r(sd))

graph hbar (mean) ca_score inc_score, ///
   over(ajbsoc) ///
   bargap(-30) bar(1, bcolor(gs8)) bar(2, bcolor(gs12)) ///
   legend(order(1 2) label(1 "SID score (spouses job)") label(2 "Income score") ///
     cols(1) pos(5) ring(0) ) ///
   scheme(s1mono) ///
   note("Source: Analysis of married males in BHPS. Scores mean standardised plus 2.", span)




*********************************************************





****************************************************
****************************************************




****************************************************
****************************************************
** 3) ANALYSIS EXAMPLE: SOCIAL INTERACTION DISTANCE ANALYSIS OF THE FRENCH 1962 DATA

****************************************************
****************************************************



* Husband-by-wife occupation table for France 1962: one record per hocc/wocc
*  cell, with the cell count in freq (so analyses are frequency weighted)
use $france_derived\fr62_hw_pcs.dta, clear
summarize
summarize [fw=freq] /* This is frequency weighted data */

tab1 hocc wocc [fw=freq]
tabplot hocc wocc [fw=freq], scheme(s1mono) height(2) ///
   xlabel(,labsize(tiny) angle(60)) ylabel(,labsize(vsmall)) xtitle("") ytitle("") 

* First model: no controls
ca hocc wocc [fw=freq], dimensions(2)
cabiplot

* Second model: exclude diagonals and farmers:

* diag flags cells where husband and wife have the same occupation code
capture drop diag
gen diag=(hocc==wocc)
* farm flags cells where both occupation codes are 0 or 10
capture drop farm
gen farm=(hocc==0 | hocc==10) & (wocc==0 | wocc==10)
tab diag farm [fw=freq]

ca hocc wocc if diag==0 & farm==0 [fw=freq], dimensions(2)
cabiplot

* Dim1 seems to be stratification

* Husbands are the row variable of the model (hocc), so their score is a row score
capture drop dim1h
predict dim1h, rowscore(1) /* Husbands' dimension 1 score */
* Wives are the column variable (wocc), so their score must be a column score;
*  using rowscore() here would simply duplicate the husbands' score
capture drop dim1w
predict dim1w, colscore(1) /* Wives' dimension 1 score */
summarize dim1h dim1w

* Graph to depict these scores:
* (two panels saved separately, then combined on common axes)
graph hbar (mean) dim1h, over(hocc, label(labsize(small))) ///
   scheme(s1mono) bar(1, bcolor(blue*0.8))  title("Men") fxsize(70) 
graph save $path9\bit1.gph, replace
graph hbar (mean) dim1w, over(wocc, label(nolabel)) scheme(s1mono) ///
   bar(1, bcolor(purple*0.7)) title("Women") fxsize(30)
graph save $path9\bit2.gph, replace
graph combine $path9\bit1.gph $path9\bit2.gph, ycommon xcommon

******************************************************
*******************************************************






****************************************************
****************************************************

** 4) ANALYSIS EXAMPLE: ALTERNATIVE INDIVIDUAL LEVEL MODELS APPLIED TO THE BHPS DATA 



use $bhps_derived\bhps_example.dta, clear
codebook, compact

* Derived explanatory variables
generate fem = (sex==2)
generate age2 = age^2
tabulate mastat
generate cohab = inlist(mastat, 1, 2)
* work = 1 when mcamsis is recorded; workmcam = mcamsis for those cases, otherwise 0
generate work = (!missing(mcamsis))
generate workmcam = 0
replace workmcam = mcamsis if work==1
codebook, compact

** Individual level models at wave 2, ignoring the household context
* NOTE(review): no year restriction is applied in this section, although the
*  heading refers to wave 2 (the SID section used keep if year==1992) - confirm
*  whether all years in the extract are meant to be pooled here.

** Subjective well-being outcome: null model, then model with covariates
regress ghq
estimates store ghq1
regress ghq fem age age2 cohab work workmcam educ4_1 educ4_3 educ4_4
estimates store ghq2

** Socio-economic advantage outcome: null model, then model with covariates
regress mcamsis
estimates store cam1
regress mcamsis fem age age2 cohab educ4_1 educ4_3 educ4_4
estimates store cam2


** Household context: Alter's influence
**  (the spouse's value of the outcome is added as an extra covariate)

regress ghq fem age age2 cohab work workmcam educ4_1 educ4_3 educ4_4 spghq
estimates store ghq3

regress mcamsis fem age age2 cohab educ4_1 educ4_3 educ4_4 spmcamsis
estimates store cam3

** Household context: Random effects
**  (random intercept for hid; xtm_var is run after each model to summarise it)

xtmixed ghq fem age age2 cohab work workmcam educ4_1 educ4_3 educ4_4 ///
     || hid:, mle variance
estimates store ghq4
xtm_var

xtmixed mcamsis fem age age2 cohab educ4_1 educ4_3 educ4_4 ///
     || hid:, mle variance
estimates store cam4
xtm_var



** 'Random slopes' models

* Allows the effect of work to be varying from household to household
* Warning: slow to estimate
xtmixed ghq fem age age2 cohab work workmcam educ4_1 educ4_3 educ4_4 ///
     || hid: work, mle variance cov(unstructured)
estimates store ghq5

* Allows the effect of higher education on outcomes to vary from household to household
* Warning: slow to estimate
xtmixed mcamsis fem age age2 cohab educ4_1 educ4_3 educ4_4 ///
     || hid: educ4_4, mle variance cov(unstructured)
estimates store cam5





** Household context: Fixed effects

xtreg ghq fem age age2 cohab work workmcam educ4_1 educ4_3 educ4_4, i(hid) fe
estimates store ghq6


xtreg mcamsis fem age age2 cohab educ4_1 educ4_3 educ4_4, i(hid) fe
estimates store cam6




***************************************************

* Side-by-side comparison of the stored models for each outcome
*  (the random slopes models ghq5 and cam5 are not included in these tables)
estimates table ghq1 ghq2 ghq3 ghq4 ghq6, b(%6.3g) stats(N bic ll r2) star


estimates table cam1 cam2 cam3 cam4 cam6, b(%7.3g) stats(N bic ll r2) star




****************************************************
****************************************************




*******************************************************
* Misc: Install tabplot if not already installed:

***
* Tabplot uses an extension routine - if it's not already installed in your 
*  stata copy then first run the below lines to allow it to install: 
/*
capture mkdir $path9\stata\
capture mkdir $path9\stata\ado\
net set ado $path9\stata\ado\
adopath + $path9\stata\ado\
findit tabplot
net from http://fmwww.bc.edu/RePEc/bocode/t
net describe tabplot
net install tabplot
*/
***