***************************************************************
***************************************************************
**** Preliminary: Specifying locations/names for data and metadata files 
* Folder holding the IPUMS-I downloaded dat file and do file
global path1 "C:\camsis\countries\portugal\data\2011\"
* Do-file with value labels for ISCO-08 (www.camsis.stir.ac.uk)
global file4 "C:\data\resources\isco\labels\isco08_labels_2.do"
* Folder used for temporary file storage
global path9 "c:\temp\"
***************************************************************


**** (i) Open source data from IPUMS-I: Portugal 2011

* Run the do file downloaded from IPUMS-I (all of the Portugal 2011 sample, with
* sex and occupation of ego and of their spouse via 'attach characteristics').
do $path1\ipumsi_00054.do

* Occupation of ego (occ) and alter (occ_sp): 3-digit ISCO-08, valid codes 11-962.
tab1 occ occ_sp

* Keep male egos with a female spouse where both hold a valid occupation code.
* The three filters are applied one after another; together they are the same
* as a single combined condition (inrange() is false for missing values).
keep if sex==1 & sex_sp==2
keep if inrange(occ, 11, 962)
keep if inrange(occ_sp, 11, 962)

* Compact check of what is left: 69k both-working heterosexual couples, 125 occ units.
codebook occ occ_sp, compact

/* Acknowledgement: 
   Minnesota Population Center. Integrated Public Use Microdata Series, International: Version 6.4 
        [Machine-readable database]. Minneapolis: University of Minnesota, 2015.
    The author wishes to acknowledge the statistical office that provided the underlying data 
      making this research possible: National Institute of Statistics, Portugal.
*/

* Temporary copy of the husband-wife microdata, used by the next section.
save $path9\file1.dta, replace
***************************************************************

**** (ii) Identify 'networked occupations' by calculating when occupational combinations are 
**            over-represented by a certain threshold

* Build a husband-occupation x wife-occupation frequency table from the couple
* microdata and derive, for every observed combination, the representation
* ratio (observed / expected under random pairing) with an approximate 95% CI.
use $path9\file1.dta, clear
rename occ hocc  /* standard label for male partner's occupation */
rename occ_sp wocc  /* standard label for female partner's occupation */

* One row per observed (hocc, wocc) pair; freq = number of couples in that pair.
gen freq = 1
collapse (count) freq, by(hocc wocc) /* data is now in frequency table format */ 

* Marginal totals. egen's total() is the current name of the legacy sum() function.
egen tot=total(freq) /* total cases across data */
egen nhocc=total(freq), by(hocc) /* totals in male occupations */ 
egen nwocc=total(freq), by(wocc) /* totals in female occupations */ 

* Marginal shares: these are proportions on a 0-1 scale, not percentages.
gen phocc=nhocc/tot /* proportion of males in the job */ 
gen pwocc=nwocc/tot /* proportion of females in the job */ 

* Expected count under independence, and observed/expected ratio.
gen ewocc=pwocc*nhocc /* expected number in the h-w combination if connections were random */ 
gen value=freq/ewocc /* ratio of observed to expected occurrences ('representation ratio') */

* Normal-approximation 95% interval around the observed cell proportion,
* then rescaled by the expected proportion to bound the representation ratio.
gen prop=freq/tot /* proportion the combination occurs */
gen staner = sqrt((prop)*(1 - prop) / tot) /* standard error of the observed proportion */
gen prop_min=prop-(1.96*staner) /* lower 95% CI  */
gen prop_max=prop+(1.96*staner) /* upper 95% CI  */
gen prop_exp=ewocc/tot /* expected proportion of combination  */
gen val_min=prop_min/prop_exp /* representation ratio at the lower CI bound */
gen val_max=prop_max/prop_exp /* representation ratio at the upper CI bound */

***label variables
label variable freq "observed number of partnerships"
label variable tot "total number in sample"
label variable nhocc "total number of males in occupation"
label variable nwocc "total number of females in occupation"
* phocc and pwocc are stored as proportions, so they are labelled as such.
label variable phocc "proportion of men in occupation"
label variable pwocc "proportion of women in occupation"
label variable ewocc "expected number of partnerships"
label variable prop "Observed proportion of all ties"
label variable staner "Standard error of observed proportion"
label variable prop_exp "Expected proportion of all ties"
label variable prop_min "Lower observed proportion of all ties"
label variable prop_max  "Upper observed proportion of all ties"
label variable value "Observed value of representation ratio"
label variable val_min "Lower bound of observed value of representation ratio"
label variable val_max "Higher bound of observed value of representation ratio"

save $path9\file2.dta, replace   /* temporary copy of the data file */
/* 
* The same calculations can be generated directly using an online command file designed for this purpose:
 do http://www.camsis.stir.ac.uk/sonocs/do/pajek.do
*/




***************************************************************

**** (iii) Exporting data from Stata, selecting ties according to the threshold approach
* Threshold approach: retain a combination only if it probably occurs at least
* twice as often as would be expected if connections were random, i.e. the
* lower bound of the confidence interval for the representation ratio is >= 2.
use $path9\file2.dta, clear

* Describe the combinations that reach the selected threshold.
summarize if val_min >= 2

* Remove every combination below the threshold (same rows survive as with
* "keep if val_min>=2").
drop if val_min < 2

* Reduce the data to the core edgelist content.
keep hocc wocc freq

* Export in Stata format (keeping labels etc) ...
save $path9\portugal_t1.dta, replace
* ... and as a comma-separated text file without a header row, which is
* convenient if other software is also to be used.
outsheet using "$path9\portugal_t1.txt", comma nonames nolabel replace

**************************************************************


*** (iv) Exporting data from Stata, selecting ties according to the popularity approach 
* Popularity approach: for every male occupation keep up to three top-ranked
* combinations, provided each occurs at least 5 times in the data.
use $path9\file2.dta, clear

* NOTE(review): the section description speaks of the "three most common"
* combinations, but the ranking variable is val_min (lower bound of the
* representation ratio), not freq. The ranking variable is left as it was;
* confirm which is intended.

* Rank within hocc by descending val_min. Ties in val_min are broken by
* descending freq and then by wocc, so the ranking (and therefore which
* combinations make the top three) is reproducible from run to run. Stata
* orders tied observations arbitrarily otherwise.
gsort +hocc -val_min -freq +wocc

* Freeze the sorted order in a variable and let bysort order on it explicitly,
* so the within-occupation rank does not depend on whether a re-sort by hocc
* happens to preserve the gsort order.
gen long rankpos = _n
bysort hocc (rankpos): gen num = _n /* rank of each ego-alter combination within the male occupation */

summarize if num <= 3 & freq >= 5 
keep if num <= 3 & freq >= 5  /* keeps combinations ranked 1-3 within occupation that occur at least 5 times */
keep hocc wocc freq /* reduces data to core edgelist content */
save $path9\portugal_t2.dta, replace 
outsheet using "$path9\portugal_t2.txt", comma nonames nolabel replace /* exports the data as a text file */

****************************************************



**** (v) Install Stata's 'nwcommands' library to define network structure
* (illustrated below is generic code that installs this extension library)
* Make sure a folder for a local installation exists; capture ignores the
* error raised when a folder is already there.
capture mkdir "$path9\stata"
capture mkdir "$path9\stata\ado"

* Add that folder to the ado search path and make it the target for installs.
adopath + "$path9\stata\ado"
net set ado "$path9\stata\ado"

* Fetch the nwcommands ado files from the package site.
net from http://www.nwcommands.org
net install nwcommands-ado
***************************************************************



***** (vi) Use the 'nwcommands' extension to chart the network structures 

* Example (1): If combination occurs at least twice as often as expected by chance 
use $path9\portugal_t1.dta, clear
summarize
* Use NWcommands to define the network structure: 
capture nwset, clear /* remove existing networks from memory if relevant */ 
nwset hocc wocc freq, name(occ1) edgelist undirected
nwsummarize
* Use NWcommands to show a sociogram of the structure.
* The title carries the census year of the data (Portugal 2011).
nwplot(occ1), title("PT 2011: RR >= 2")   lab labelopt(mlabsize(tiny)) layout(mdsclassical) ///
      scatteropt(mfcolor(gs13) mlcolor(gs7) msymbol(circle)) scheme(s1mono) 

* Example (2): Combinations selected by the popularity approach, so long as at least 5 instantiations
use $path9\portugal_t2.dta, clear
summarize
* Use NWcommands to define the network structure: 
capture nwset, clear /* remove existing networks from memory if relevant */ 
* The second network gets its own name: because the clear above is wrapped in
* capture, a failure would be silent, and reusing the name occ1 could then
* leave the plot below pointing at the network from Example (1).
nwset hocc wocc freq, name(occ2) edgelist undirected
nwsummarize
* Use NWcommands to show a sociogram of the structure (with slightly different plot settings): 
nwplot(occ2), title("PT 2011: Popularity threshold (up to 3 most popular)")  ///
    lab labelopt(mlabsize(tiny)) layout(mdsclassical) ///
      scatteropt(mfcolor(green*0.5) mlcolor(gs7) msymbol(circle)) scheme(s1mono) 

***************************************************************







****** (vii) Illustration of comparable analysis using R  

/* within R, code such as the below would generate a network sociogram: 
install.packages("sna")
library(sna)
install.packages("statnet")
library(statnet)
######
## Occ subsets
threshold1 <- read.csv("c:/temp/portugal_t1.txt", header=FALSE, col.names=c("hocc", "wocc", "freq"))
names(threshold1)
## Threshold1 
adj_source <- data.frame(hocc=as.factor(threshold1$hocc), wocc=as.factor(threshold1$wocc)) 
adj_source2 <- data.frame(hocc=threshold1$hocc, wocc=threshold1$wocc) 
adj1 <- network(adj_source, directed=TRUE, matrix.type="edgelist")
class(adj1)
summary(adj1)
network.size(adj1)
network.edgecount(adj1)
adj1b <- network(adj_source2, directed=TRUE, matrix.type="edgelist")
class(adj1b)
summary(adj1b)
network.size(adj1b)
network.edgecount(adj1b)
cellsize1 <- by(threshold1$freq, list(threshold1$occgb), sum) # Gets values of cellsize
# Graphical presentation of network connections
adj.empty <- network.initialize(5)
par(mfrow=c(1,1))
plot.network(adj1, displaylabels=T, boxed.labels=F, main="Threshold: k=3, c=7" ,  edge.col="grey50" , 
    usearrows=F ,   object.scale=0.0002 ,  vertex.cex=cellsize1, vertex.col="grey80", pad=-1,
      label.cex=0.7, label.pos=2, label.col="grey30")
##
*/
****************************************************

** EOF.