/*
 Demetris Christodoulou
 demetris.christodoulou@sydney.edu.au
 Graph Workflow ©
 www.graphworkflow.com
*/

// Interpret every command below under Stata 15 rules.
version 15
// Make the project folder the working directory: the CSV imported later and the
// PNG exported at the end are both referenced by relative file name.
// NOTE(review): "graphworklfow" looks like a transposition of "graphworkflow" --
// confirm that the folder really has this name on disk.
cd "/Users/dchristodoulou/Documents/graphworklfow/graphs/benford"
// Typefaces for text in the Graph window: the default face and the serif face.
graph set window fontface "Avenir"
graph set window fontfaceserif "Palatino"

clear all

// Reference table of expected frequencies under Benford's Law.
// One row per leading number d = 1, ..., 100 with prob = log10(1 + 1/d).
// The table is parked in a temporary file so that it can be merged onto the
// observed frequencies once those have been computed.
tempfile tben
local n_leading 100
set obs `n_leading'
generate digits = _n
generate prob = log10(1 + 1/digits)
save "`tben'"

clear
import delimited benford_income.csv
// Hold income as text so that its leading characters can be cut out with substr().
tostring income, replace force

// collapse needs some variable to summarise; an arbitrary constant tag is enough,
// because (percent) only counts the non-missing observations in each group.
generate tag = 1
tempfile t1 t2 t3

// Observed share of each leading number, by length of the leading string:
//   k = 1 -> first digit        (1, ..., 9)
//   k = 2 -> first two digits   (10, ..., 99)
//   k = 3 -> first three digits (only 100 is kept, to close the 1-100 axis)
forvalues k = 1/3 {
    preserve
    generate digits = substr(income, 1, `k')
    destring digits, replace force
    collapse (percent) digits_pc = tag, by(digits)
    // Discard leading numbers outside the range that belongs to this k
    // (e.g. 0, missing, or an income shorter than k digits). Without this, a
    // short income such as "5" would enter the two-digit table as digits = 5,
    // duplicate the one-digit row, and make the 1:1 merge below fail.
    // The percentages are computed before this filter, so the remaining
    // shares are still relative to all observations.
    local lo = 10^(`k' - 1)
    local hi = cond(`k' < 3, 10^`k' - 1, 100)
    keep if inrange(digits, `lo', `hi')
    save "`t`k''"
    restore
}

// Stack the three tables, turn percentages into proportions, and attach the
// Benford probabilities saved earlier in `tben'.
use "`t1'", clear
append using "`t2'" "`t3'"
replace digits_pc = digits_pc/100
merge 1:1 digits using `tben'
drop _merge
// Rows that exist only in the Benford table can be placed after the matched
// rows by merge; restore ascending order so that connected lines drawn from
// these data do not zigzag.
sort digits

// Build the y-axis label list as pairs of: tick position, "label text".
// Position: log10 of the Benford probability, matching the log10 scale on which
//           the series are plotted.
// Text:     the Benford probability itself, in percent with two decimals.
// The values are computed inline rather than stored in global scalars, which
// would linger in the session and could be shadowed by a variable of the same
// name.
local ylablist ""
foreach d of numlist 1/10 20(10)100 {
    local pct_text : display %4.2f 100*log10(1 + 1/`d')
    local ylablist `ylablist' `=log10(log10(1+1/`d'))' "`pct_text'"
}

// Flag the leading numbers whose observed share is printed above the marker:
// 1 and every multiple of ten from 10 to 100.
generate show_up = digits == 1 | (mod(digits, 10) == 0 & inrange(digits, 10, 100))

// Text version of the observed share, in percent with one decimal, for use as
// a marker label.
generate show_digits_pc = 100*digits_pc
tostring show_digits_pc, replace force format(%03.1f)

// Move both the observed and the expected proportions to the log10 scale.
foreach series of varlist digits_pc prob {
    replace `series' = log10(`series')
}

// Build the x-axis label list as pairs of: tick position, "label text".
// Position and text are both the leading number itself (1-10, then 20, ..., 100),
// so the loop value is used directly rather than passing through a scalar.
local xlablist ""
foreach d of numlist 1/10 20(10)100 {
    local xlablist `xlablist' `d' "`d'"
}

// Observed versus Benford-expected log-proportions for leading numbers 1-100.
// Layers, in plot order:
//   1-2  lines restricted to digits==. : intended to draw nothing and to supply
//        thick line keys for the legend, which shows plots 1 and 2 only
//   3    spikes joining the expected and the observed value at each number
//   4    observed values (small markers)
//   5-6  expected values (thin line plus small markers)
//   7    invisible markers carrying the observed percentage as a label
//   8    hollow squares on the expected values at 1-9 and at multiples of ten
// The expected line uses the sort option: line connects points in dataset
// order, so rows that are not in ascending order of digits would otherwise
// produce a zigzag.
// NOTE(review): digits is plotted untransformed (linear x-axis) while the
// xtitle reads "Log of first digits" -- confirm which one is intended.
twoway (line prob digits if digits==., lw(*1.75) lc("164 14 76")) ///
       (line prob digits if digits==., lw(*1.75) lc("18 78 120")) ///
       (rspike prob digits_pc digits, lc("164 14 76") lw(*.2)) ///
       (scatter digits_pc digits, ms(o) msiz(*.2) mc("164 14 76") mlw(*.5)) ///
       (line prob digits, sort lw(*.5) lc("18 78 120")) ///
       (scatter prob digits, ms(o) msiz(*.2) mc("18 78 120") mlw(*.5)) ///
       (scatter digits_pc digits if show_up, ms(i) ///
                mlab(show_digits_pc) mlabc("164 14 76") mlabpos(12) mlabsiz(*.65)) ///
       (scatter prob digits if inrange(digits,1,9) | inlist(digits,10,20,30,40,50,60,70,80,90,100) ///
        , ms(s) msiz(*.7) mlc("18 78 120") mfc(gs16) mlw(*.5)) ///
       , ysize(1) xsize(2) ///
       legend(ring(1) pos(1) col(2) order(1 2) symxsize(*.2) size(*.75) ///
             region(lc(none) fc(none)) bmargin(t=0 b=0) ///
             label(1 "Self-disclosure frequency") ///
             label(2 "Expected disclosure frequency")) ///
       xtitle("Log of first digits of self-disclosed income", size(*.75) margin(t+2)) ///
       ytitle("Log-proportions", size(*.75)) ///
       title("Deviations of self-disclosed income from Benford's Law", size(*.7) ring(1) pos(11)) ///
       ylabel(`ylablist', labsize(*.45) grid glw(*.25) tlw(*.5) tl(*.5) angle(0)) ///
       xlabel(`xlablist', labsize(*.55) grid glw(*.25) tlw(*.5) tl(*.5)) ///
       yscale(lc(none)) xscale(lc(none)) ///
       plotregion(lc(gs0) lw(*.5) margin(medium)) ///
       graphregion(fc(gs16) lc(gs0) lw(*.5)) name(g1,replace)
graph export benford_income.png, replace


// Stop here so that the free-text note below is never read as commands.
exit

Note: the data source is commercially sensitive, which is why no data link is provided.
