***************************************************************
***************************************************************
* Purpose: build husband-wife occupational edgelists from the
*   IPUMS-I Portugal 2011 sample, select ties by two alternative
*   rules (representation-ratio threshold; within-occupation rank),
*   export them, and chart the resulting networks with 'nwcommands'.
* Inputs : $path1\ipumsi_00054.do (IPUMS-I extract reader)
* Outputs: $path9\file1.dta, file2.dta, portugal_t1.dta/.txt,
*          portugal_t2.dta/.txt
***************************************************************

**** Preliminary: Specifying locations/names for data and metadata files

global path1 "C:\camsis\countries\portugal\data\2011\"   /* IPUMS-I downloaded dat file and do file */
global file4 "C:\data\resources\isco\labels\isco08_labels_2.do"   /* value labels for ISCO-08 (www.camsis.stir.ac.uk); not used further in this script */
global path9 "c:\temp\"   /* for temporary file storage */


***************************************************************
**** (i) Open source data from IPUMS-I: Portugal 2011

do $path1\ipumsi_00054.do
   /* downloaded from ipumsi: all Portugal 2011 sample */
   /* with sex, occupation of ego and their spouse ('attach characteristics') */

tab1 occ occ_sp
   /* this is occupation of ego and alter, 3-digit ISCO-08, valid codes 11-962 */

* Keep male egos with a female spouse where both hold a valid occupation code
keep if sex==1 & sex_sp==2 & occ >= 11 & occ <= 962 & occ_sp >= 11 & occ_sp <= 962

codebook occ occ_sp, compact   /* 69k both-working heterosexual couples, 125 occ units */

/* Acknowledgement:
   Minnesota Population Center. Integrated Public Use Microdata Series,
   International: Version 6.4 [Machine-readable database]. Minneapolis:
   University of Minnesota, 2015.
   The author wishes to acknowledge the statistical office that provided the
   underlying data making this research possible:
   National Institute of Statistics, Portugal.
*/

save $path9\file1.dta, replace   /* a temporary copy of the husband-wife microdata */


***************************************************************
**** (ii) Identify 'networked occupations' by calculating when occupational combinations are
**        over-represented by a certain threshold

use $path9\file1.dta, clear
rename occ hocc      /* standard label for male partner's occupation */
rename occ_sp wocc   /* standard label for female partner's occupation */

gen freq = 1
collapse (count) freq, by(hocc wocc)   /* data is now in frequency table format: one row per hocc-wocc pair */

egen tot   = total(freq)               /* total cases across data */
egen nhocc = total(freq), by(hocc)     /* totals in male occupations */
egen nwocc = total(freq), by(wocc)     /* totals in female occupations */

gen phocc = nhocc/tot                  /* proportion of males in the job */
gen pwocc = nwocc/tot                  /* proportion of females in the job */
gen ewocc = pwocc*nhocc                /* expected number in the h-w combination if connections were random */
gen value = freq/ewocc                 /* ratio of observed to expected count ('representation ratio') */

gen prop   = freq/tot                          /* proportion the combination occurs */
gen staner = sqrt((prop)*(1 - prop) / tot)     /* standard error of that proportion */
gen prop_min = prop-(1.96*staner)              /* lower 95% CI */
gen prop_max = prop+(1.96*staner)              /* upper 95% CI */
gen prop_exp = ewocc/tot                       /* expected proportion of combination */
gen val_min  = prop_min/prop_exp               /* representation ratio at the lower CI bound */
gen val_max  = prop_max/prop_exp               /* representation ratio at the upper CI bound */

*** label variables
* (phocc and pwocc are proportions, n/tot, so they are labelled as such)
label variable tot      "total number in sample"
label variable nhocc    "total number of males in occupation"
label variable nwocc    "total number of females in occupation"
label variable phocc    "proportion of men in occupation"
label variable pwocc    "proportion of women in occupation"
label variable ewocc    "expected number of partnerships"
label variable prop     "Observed proportion of all ties"
label variable prop_exp "Expected proportion of all ties"
label variable prop_min "Lower observed proportion of all ties"
label variable prop_max "Upper proportion of all ties"
label variable value    "Observed value of representation ratio"
label variable val_min  "Lower bound of observed value of representation ratio"
label variable val_max  "Higher bound of observed value of representation ratio"

save $path9\file2.dta, replace   /* temporary copy of the data file */

/*
   The same calculations can be generated directly using an online command file
   designed for this purpose:
      do http://www.camsis.stir.ac.uk/sonocs/do/pajek.do
*/


***************************************************************
**** (iii) Exporting data from Stata, selecting ties according to the threshold approach

use $path9\file2.dta, clear

/* Selected threshold: combination probably occurs at least twice as often as would
   expect if connections were random
   (i.e. lower bound of confidence interval for representation ratio exceeds 2) */

summarize if val_min>=2   /* checks the data for those cases that make the selected threshold */
keep if val_min>=2        /* drops cases which do not meet the selected threshold */
keep hocc wocc freq       /* reduces data to core edgelist content */

save $path9\portugal_t1.dta, replace   /* exports data in Stata format (keeping labels etc) */
outsheet using "$path9\portugal_t1.txt", comma nonames nolabel replace
   /* also exports the data as a text file: comma-separated, no header row
      (text file is convenient if other software is also to be used) */


**************************************************************
*** (iv) Exporting data from Stata, selecting ties according to the popularity approach

use $path9\file2.dta, clear

/* Selected threshold: combination is one of the three highest-ranked for the
   (male) occupation, ranked by val_min (lower bound of the representation ratio),
   and it occurs at least 5 times in the data */

gsort +hocc -val_min        /* sorts the data by occupation and descending order of val_min */
bysort hocc: gen num=_n     /* within occupations gives a rank to each ego-alter occupational permutation;
                               relies on the within-hocc order left by the gsort above */

summarize if num <= 3 & freq >= 5
keep if num <= 3 & freq >= 5   /* drops all combinations not within the 3 highest connections within occupations */
keep hocc wocc freq            /* reduces data to core edgelist content */

save $path9\portugal_t2.dta, replace
outsheet using "$path9\portugal_t2.txt", comma nonames nolabel replace   /* exports the data as a text file */


****************************************************
**** (v) Install Stata's 'nwcommands' library to define network structure
*   (illustrated below is generic code that installs this extension library)

capture mkdir $path9\stata
capture mkdir $path9\stata\ado
adopath + "$path9\stata\ado"   /* code to ensure have somewhere suitable for local installation */

net from http://www.nwcommands.org
net set ado "$path9\stata\ado"
net install nwcommands-ado


***************************************************************
***** (vi) Use the 'nwcommands' extension to chart the network structures

* Example (1): If combination occurs at least twice as often as expected by chance
use $path9\portugal_t1.dta, clear
summarize

* Use NWcommands to define the network structure:
capture nwset, clear   /* remove existing networks from memory if relevant */
nwset hocc wocc freq, name(occ1) edgelist undirected
nwsummarize

* Use NWcommands to show a sociogram of the structure:
nwplot(occ1), title("PT 2011: RR >= 2") lab labelopt(mlabsize(tiny)) layout(mdsclassical) ///
    scatteropt(mfcolor(gs13) mlcolor(gs7) msymbol(circle)) scheme(s1mono)

* Example (2): Most popular combinations, regardless of RR value, so long as at least 5 instantiations
use $path9\portugal_t2.dta, clear
summarize

* Use NWcommands to define the network structure:
capture nwset, clear   /* remove existing networks from memory if relevant */
nwset hocc wocc freq, name(occ1) edgelist undirected
nwsummarize

* Use NWcommands to show a sociogram of the structure (with slightly different layout settings):
nwplot(occ1), title("PT 2011: Popularity threshold (up to 3 most popular)") ///
    lab labelopt(mlabsize(tiny)) layout(mdsclassical) ///
    scatteropt(mfcolor(green*0.5) mlcolor(gs7) msymbol(circle)) scheme(s1mono)


***************************************************************
****** (vii) Illustration of comparable analysis using R

/* within R, code such as the below would generate a network sociogram:

install.packages("sna")
library(sna)
install.packages("statnet")
library(statnet)

######
## Occ subsets
# The file written by outsheet above is comma-separated with no header row,
# holding the columns hocc, wocc, freq in that order.
threshold1 <- read.csv("c:/temp/portugal_t1.txt", header = FALSE,
                       col.names = c("hocc", "wocc", "freq"))
names(threshold1)

## Threshold1
adj_source  <- data.frame(hocc = as.factor(threshold1$hocc), wocc = as.factor(threshold1$wocc))
adj_source2 <- data.frame(hocc = threshold1$hocc, wocc = threshold1$wocc)

adj1 <- network(adj_source, directed = TRUE, matrix.type = "edgelist")
class(adj1)
summary(adj1)
network.size(adj1)
network.edgecount(adj1)

adj1b <- network(adj_source2, directed = TRUE, matrix.type = "edgelist")
class(adj1b)
summary(adj1b)
network.size(adj1b)
network.edgecount(adj1b)

# NOTE(review): 'occgb' is not a column of the exported edgelist (hocc, wocc, freq);
# the grouping variable for vertex sizes needs to be confirmed before running this line.
cellsize1 <- by(threshold1$freq, list(threshold1$occgb), sum)   # Gets values of cellsize

# Graphical presentation of network connections
adj.empty <- network.initialize(5)
par(mfrow = c(1, 1))
plot.network(adj1, displaylabels = TRUE, boxed.labels = FALSE,
             main = "Threshold: k=3, c=7",
             edge.col = "grey50", usearrows = FALSE,
             object.scale = 0.0002, vertex.cex = cellsize1, vertex.col = "grey80",
             pad = -1, label.cex = 0.7, label.pos = 2, label.col = "grey30")
##

*/

****************************************************
** EOF.