Dienstag, 14. Mai 2013

holiday dataset for R

 

I work with sales datasets from retail companies. It is important for me to know if a day was a holiday, and to estimate the effect of that holiday on sales.

But also the days before and after a holiday may be informative, that is why I wrote this little routine: It generates an R dataset for the specified years with all the holidays in Austria. It can be easily adjusted to other nations by adopting the “national” vector.

Demetra+ Manual http://www.cros-portal.eu/sites/default/files//Demetra%2B%20User%20Manual%20November%202012.pdf  was the source for the holidays. Demetra+ is a free software for seasonal adjustment, but its finest aggregate is the month – which is to coarse for my use case.

 

makeholidaydataset=function(daysbefore=30,daysafter=7,years=2001:2013){
  #this function generates Austrian holidays dataset usable for regressions; adjust the fixed holidays in fh
  # to adapt for other countries add line to national
 
  library(data.table)
  library(foreign)
 
  #fixed holidays
 
  fh=matrix(c(
  "NewYear",1,1,
  "MayDay" ,5,1,
  "Halloween",10,31,
  "AllSaintsDay",11,1,
  "Christmas",12,25),ncol=3,byrow=T)
  national=matrix(c(
    "Nationalholiday",10,26,
    "StMary",8,15,
    "MariaEmpfaegnis",12,8),ncol=3,byrow=T)
 
  fixedholidays=rbind(fh,national)
  fixedholidays=data.table(holiday=fixedholidays[,1],month=as.integer(fixedholidays[,2]),day=as.integer(fixedholidays[,3]))
 
  library(plyr)
  gendates=function(year){
    dates=as.IDate(ISOdate(year=year,month=fixedholidays$month, day=fixedholidays$day))
    data.table(date=dates,holiday=fixedholidays$holiday)
  }
  DTholidays=data.table(ldply(years,gendates))
   
 
  #variable holidays: easter related
  #easter=read.csv(file="c:/git/da/data/holiday/easter.csv",row.names=NULL)
 
  easter=c("1990-04-15","1991-03-31","1992-04-19","1993-04-11","1994-04-03","1995-04-16","1996-04-07","1997-03-30","1998-04-12","1999-04-04",
           "2000-04-23","2001-04-15","2002-03-31","2003-04-20","2004-04-11","2005-03-27","2006-04-16","2007-04-08",
           "2008-03-23","2009-04-12","2010-04-04","2011-04-24","2012-04-08","2013-03-31","2014-04-20","2015-04-05",
           "2016-03-27","2017-04-16","2018-04-01","2019-04-21")
  easterdates=as.IDate(easter)
  eastermonday=easterdates+1
  ashwednesday=easterdates-46
  ascensionday=easterdates+40
  pentecost=easterdates+50
  whitmonday=pentecost+1
  maundythursday=easterdates-3
  goodfriday=easterdates-2
 
variableholidays= rbind(
    data.table(holiday="eastersunday",date=easterdates),
  data.table(holiday="eastermonday",date=eastermonday),
  data.table(holiday="ashwednesday",date=ashwednesday),
  data.table(holiday="ascensionday",date=ascensionday),
  data.table(holiday="pentecost",date=pentecost),
  data.table(holiday="whitmonday",date=whitmonday),
  data.table(holiday="maundythursday",date=maundythursday),
  data.table(holiday="goodfriday",date=goodfriday)
  )
 
  allholidays=rbind(DTholidays,variableholidays,use.names=T)
  allholidays$value=1
 
  #generate before and after window
  library(reshape2)
 
  A=acast(allholidays,date~holiday,fun.aggregate=sum)
  DATES=as.IDate(rownames(A))
  offsets=-daysbefore:daysafter
 
  returnoffseteddswithdate=function(offset){
    d=DATES+offset
    class(d)=c("IDate" ,"Date" )
   
    B=copy(A)
    rownames(B)=as.character(d)
    B[B==1]=offset
    B=as.data.frame(B)
    B$date=as.IDate(rownames(B))
    B
  }
  all=data.table(ldply(offsets,returnoffseteddswithdate))
  m=melt(all,id.vars="date")
  m=data.table(m)
  m=m[,list(value=sum(value)),by=list(date,variable)]
  all=data.table(dcast(m,date~variable,fun.aggregate=sum))
  all=all[order(date)]
  all
}

DTholiday=makeholidaydataset()
save(DTholiday,file="holidays.rdata")

easter=DTholiday[eastersunday!=0,list(date,eastersunday)]
setkey(easter,date)
DTisholiday=makeholidaydataset(daysbefore=0,daysafter=0)[,list(date,isholiday=1)]
setkey(DTisholiday,date)
myholidays=merge(DTisholiday,easter,all=T)