A friend recently posted "Is it weird that at least 3 of my friends have their 3rd anniversary today?". I was curious. So, I constructed a Monte Carlo simulation to determine if it was, in fact, "weird". Short answer: no (p ≈ 0.21).

If we assume that that 14% of
weddings happen in June, 75% of those on a Saturday, and that
relationship lengths are exponentially distributed with a mean of 8
years, then a frequency distribution for the number of independent couples out of a population of 649 having 2009-06-13 as their anniversary is:

1: 35.5%

2: 23.2%

3: 14.5%

4: 4.7%

5: 1.9%

6: 0.3%

The R code I used for this was:

# Probabilities of weddings in each month

# Data for weddings in the Netherlands in 1995 taken from

# http://g2.cvs.sourceforge.net/viewvc/g2/g2/g2/demo/bargraph/graphdata.py

pr_month = c(

2947, 3121, 4514,

5708, 9794, 12427,

6825, 8630, 12108,

6411, 4045, 4939 )

pr_month = pr_month/sum(pr_month)

# Unfortunately, I'm not able to find data on which week days are most common.

# All I've found is that Saturday is 'by far the most common', followed by

# Sunday, with weekday weddings being less common.

pr_day = list()

pr_day[["Monday" ]] = 0.01

pr_day[["Tuesday" ]] = 0.01

pr_day[["Wednesday"]] = 0.01

pr_day[["Thursday" ]] = 0.01

pr_day[["Friday" ]] = 0.01

pr_day[["Saturday" ]] = 0.75

pr_day[["Sunday" ]] = 0.20

# First marriages which end in divorce last an average of 8 years

# (see http://www.census.gov/prod/2005pubs/p70-97.pdf )

# I'll assume marriage lengths follow an exponential distribution with mean 8

# For performance, cache things we will re-use

month_days = list()

day_probs = list()

# Define a function which can generate a specified number of wedding dates,

# following the probabilities given above:

random_dates = function(n){

dates = rep(as.Date(NA), n)

ages = sort(floor(rexp(n, 1/8)),decreasing=T) # Sort to get good cache performance

month = sample(1:12, n, prob=pr_month, replace=T)

for (i in 1:length(ages)){

m_ix = sprintf('%i-%i-1',2012-ages[i],month[i])

if (is.null(month_days[[m_ix]])){

# Get a list of the days in the month

start = as.Date(m_ix)

stop = as.Date( ifelse(month[i]==12,

sprintf('%i-1-1', 2012-ages[i]+1),

sprintf('%i-%i-1', 2012-ages[i], month[i]+1)) )

month_days[[m_ix]] <<- head(seq(start, stop, by='day'),-1)

}

if (is.null(day_probs[[m_ix]])){

# Figure probabilities for each day

day_prob = rep(NA, length(month_days[[m_ix]]))

for (j in 1:length(day_prob)){

day_prob[j] = pr_day[[ weekdays(month_days[[m_ix]][j]) ]]

}

day_probs[[m_ix]] <<- day_prob/sum(day_prob)

}

# Pick a day out of this month

dates[i] = sample(month_days[[m_ix]], 1, prob=day_probs[[m_ix]] )

}

dates

}

# Generate 1000 sets of 649 independent dates

# Figure out how many times out of those 1000 simulations

# there are more than three which fall on 13 June 2009.

count=rep(0,1000)

for (i in 1:1000){

# Progress meter:

if (i%%10==0) cat (i)

else if (i%%2==0) cat ('.')

if (i%%50==0) cat('\n')

count[i] = sum( random_dates(649) == as.Date('2009-06-13'))

}

cat('\n')

# Generate a frequency distribution

for( i in 1:max(count)){

cat(sprintf('%i: %.1f\n', i, 100*sum(count==i)/length(count)))

}

## No comments:

Post a Comment