\documentstyle[fancybox,sem-a4,semrot,palatino,epsf,boldvectors]{seminar}

\renewcommand{\S}{{\sf S}}
\newcommand{\oswald}{{\sf Oswald}}

\def\printlandscape{\special{landscape}}    % Works with dvips.

\newpagestyle{327}%
  {\oswald \hspace{\fill}\rightmark \hspace{\fill}\thepage}{}%
\pagestyle{327}

\articlemag{1}

%\markright{Hello}

\slideframe{Oval}

\newcommand{\heading}[1]{%
  \begin{center}
    \large\bf
    \shadowbox{#1}%
  \end{center}
  \vspace{1ex minus 1ex}}

\begin{document}

\begin{slide}
  \begin{center}
  \heading{\Large\sf OSWALD}
  
  Object-oriented SoftWare for the Analysis of
  Longitudinal Data

  \bigskip
  
  {\bf Dave M. Smith}\\
  Department of Mathematics and Statistics\\Lancaster University, UK\\
  {\tt D.M.Smith@lancaster.ac.uk}

  \bigskip

  \bf{November 1, 1995}

  \bigskip
  
  {\small \tt http://www.maths.lancs.ac.uk:2080/$\sim$maa036/Oswald/}
  \end{center}

\end{slide}

\begin{slide*}{}
  \heading{What is \oswald ?}

  \begin{itemize}
    \item {\bf \underline{O}bject-oriented \underline{S}oft\underline{w}are}
      \begin{itemize}
        \item An extension to the \S\ language
        \item Data types for longitudinal data
          \begin{itemize}
            \item time series
            \item collections of time series
          \end{itemize}
        \item Operations on data types
        \item Syntax for describing suitable linear models
      \end{itemize}


    \item {\bf \underline{A}nalysis of \underline{L}ongitudinal \underline{D}ata}
      \begin{itemize}
        \item Reading in data
        \item OLS analysis (ignoring correlation)
        \item GEE and ALR
        \item Parametric models for correlation
          \begin{itemize}
          \item Variogram estimation
          \end{itemize}
        \end{itemize}
        \item Plan to implement all in Diggle, Liang \&\ Zeger (1994)
      \end{itemize}
\end{slide*}

\begin{slide}{}
  \heading{What is \S ?}

  \begin{itemize}
  \item ``A programming environment for data analysis and graphics''
  \item Extensible interpreted functional language
  \item Object-oriented features
    \begin{itemize}
      \item Create new data types
      \item Extend existing functions
      \item Create new functions
    \end{itemize}
  \item Sophisticated graphical and statistical features
  \item Available for Windows, Unix and other platforms
  \end{itemize}
  
  Using \S\ as a base allows us to build on an existing and familiar
  user interface.
\end{slide}

\begin{slide}{}
  \heading{What are longitudinal data?}
  
  \begin{itemize}
    \item ``Replicated time series''

    \item {\bf Time series}: Observations on a single subject taken at
      a number of points in time.
      \begin{itemize}
        \item Not necessarily equally spaced

        \item Possibly distinct covariates at each time point
      \end{itemize}

    \item {\bf Longitudinal data}: Time series taken over a number of
      subjects
      \begin{itemize}
        \item Covariates on each subject (e.g.\ treatment groups)

        \item Generally a large number of short time series

        \item Time points do not need to be the same for all subjects
  
        \item Observations within a time series are correlated, but
          assume observations from different subjects are independent
    \end{itemize}
  \end{itemize}
\end{slide}

%Example S script demonstrating some simple Oswald features

\begin{slide}
  \heading{Data structures in Oswald}
  
  \oswald\ provides data structures (`{\bf objects}') both for single
  time series and for collections of time series (longitudinal data)

  Objects designed to look like ordinary \S\ objects: time series are
  vector-ish, LDA objects are matrix-ish.

  Method functions provided to do `sensible' things for common
  operations.
\end{slide}

\begin{slide*}{}
  \heading{Single time series}

  \begin{itemize}
  \item {\tt ldats} objects

  \item Ordered sequence of (time, observation) pairs

  \item Created from a vector of time values and a vector of
    observation values

  \item Time points need not be uniformly spaced

  \item Operations include:
    \begin{itemize}
      \item Creation: {\tt ts <- ldats(y,t)}
      \item Sub-selection: {\tt ts[1:10]}
      \item Comparison: {\tt ts[ts>0]}
      \item Transformation: {\tt log(ts)}
      \item Printing: {\tt print(ts)}
      \item Plotting: {\tt plot(ts)}
    \end{itemize}
  \end{itemize}
\end{slide*}

\begin{slide}
  \heading{Example}
{\small \begin{verbatim}
> cow52[1:10]
      [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9] [10] 
TIME    1    2    3    4    5    6    7    8    9   10
 OBS 3.79 4.07 3.35 3.46 3.20 3.99 3.54   NA 3.25 3.30
> plot(cow52)
\end{verbatim}}
  \setslidelength{\epsfxsize}{6in}
  \centerline{\epsffile{Comp/cow52.ps}}
\end{slide}

\begin{slide*}
  \heading{Collections of time series}
  
  \begin{itemize}
  \item longitudinal data

  \item When time points are common for all subjects (or nearly so)
    the series may be represented as rows of a matrix ({\tt balanced} objects)

  \item If time points are not common across subjects (completely
    unbalanced data) a different data structure exists

  \item Information on time points and possibly treatment structure
    may also be accommodated in the data structures

  \item Operations include:
    \begin{itemize}
      \item Creation: {\tt ld <- balanced(X,t)}, or\\
        {\tt ld <- read.balanced("file")}
      \item Sub-selections: {\tt ld[,1:5]}
      \item Transformations: {\tt log(ld)}
      \item Printing: {\tt print(ld)}
      \item Plotting: {\tt plot(ld)}
    \end{itemize}
  \end{itemize}
\end{slide*}

\begin{slide}{}
  \heading{Displaying collections of time series}

  \begin{itemize}
  \item Typically many points $\Rightarrow$ difficult to display
    meaningfully on a single plot

  \item Need to capture {\em individual\/} (within-series) behaviour
    as well as {\em group\/} (across series) behaviour

  \item May also wish to discriminate between certain groups
    (e.g. treatment groups) $\rightarrow$ {\tt groups} attribute

  \item {\bf Strategy 1}: Plot most series as ``shadows'' (impression
    of variability); plot a few series series from each group in full
    (typical individual behaviour)

  \item {\bf Strategy 2}: Average across subjects within groups at
    each time point (group behaviour)
  \end{itemize}
\end{slide}

\begin{slide*}
  \heading{The Milk Data}

  Example data set, first analysed in Diggle (Biometrics, 1988).  Data
  consists of weekly measurements of protein content from milk samples
  \begin{itemize}
    \item 79 cows assigned to 3 diets (1--25 Barley; 26--52 Mixed;
      53--79 Lupins)
    \item from 12 to 19 observations per cow
    \item Times in weeks since calving; experiment terminated at a
      fixed date
  \end{itemize}

{\small \begin{verbatim}
> summary(milk)
Balanced LDA matrix:
Number of time series: 79 
Number of observations per series: 19 
Total number of observations: 1501
                     (164 missing)
\end{verbatim}}
\end{slide*}


\begin{slide}
  \heading{Milk data plots}

  Within-group averages:
{\small\begin{verbatim}
> plot(olsfit(milk)
> ldamat.legend(olsfit(milk),locator(1))
\end{verbatim}}

  \setslidelength{\epsfxsize}{6in}
  \centerline{\epsffile{Comp/milkfit.ps}}

  `Shadow' plot:\\
  {\tt > plot(milk, p=0.2)}
\end{slide}
% Separate transparency goes here

\begin{slide}{}
  \heading{A parametric model for correlation}

  Proposed by Diggle (Biometrics, 1988).

  Marginally: $\rm{E}[\vec{Y}] = X \vec{\beta}$

  Correlation within series comprises three components:
  \begin{itemize}
    \item Serial correlation (realisation of a stationary Gaussian
      process with variance $\sigma^2$ and correlation function
      $\rho(u,\phi)$)
    \item A random intercept for each subject (with variance $\nu
      \sigma^2$)
    \item Measurement error (realised independently for each
      observation, with variance $\tau \sigma^2$)
  \end{itemize}
  
  Initial estimates of the variance parameters are required, and may
  be obtained from the sample variogram.
  Mean and variance parameters are then estimated using REML.
\end{slide}

\begin{slide*}
  \heading{The Variogram}

For a stationary process, $Y(t)$ say, the variogram is the function
\[
  \gamma(u) = \frac{1}{2}{\rm E}[\{Y(t) - Y(t-u)\}^2]
\]
For our model:
\[
  \vec{Y}_i = \vec{\mu}_i + \vec{W}_i + \vec{1} U_i + \vec{E}_i
\]
where
\begin{eqnarray*}
  \vec{W}_i & \sim & \mbox{N}(\vec{0}, H),~\mbox{where}~
  H=[\rho(|t_{ij}-t_{ik}|, \phi)] \\
  U_i & \sim & \mbox{N}(0, \sigma^2 \nu) \\
  \vec{E}_i & \sim & \mbox{N}(\vec{0}, \sigma^2 \tau I)
\end{eqnarray*}

Under this model, it may be shown that $\vec{Y} - \vec{\mu}$ has the
variogram
\[ \gamma(u) = \sigma^2(\tau + 1 - \rho(u)) \]
and so an estimate of the variogram allows us to get first estimates
of the variance parameters.
\end{slide*}

\begin{slide}
  \heading{Calculating the variogram}
  To calculate the variogram $\gamma(u)$, average over within-series
  pairs at lag $u$ using detrended data.  Detrending is accomplished
  by taking the residuals from a saturated groups$\times$times model.
{\small \begin{verbatim}
> res <- olsres(milk); vg <- variogram(res)
> plot(vg)
\end{verbatim}}
  \setslidelength{\epsfxsize}{6in}
  \centerline{\epsffile{Comp/milkvg.ps}}
\end{slide}

\begin{slide}{}
  \heading{Parameter estimation}

  To estimate the parameters in the model, the REML likelihood is
  maximised given initial values of the variance parameters.

  \begin{itemize}
  \item Mean model specified using a familiar {\sf Glim}-style
    notation.  Explanatory variables are in terms of a column vector
    of responses (stacked $\vec{Y}_i$).
  \item Special support for time-dependent variables ({\tt tex()}) and
    subject-specific variables ({\tt gex()}).
  \item Missing values are allowed, but ignored.
  \item Initial values for variance parameters are specified in the
    call to {\tt reml.fit}
  \end{itemize}
\end{slide}

\begin{slide*}{}
\ptsize{8}
{\small \begin{verbatim}
Longitudinal Data Analysis Model

Call:
reml.fit(formula = milk ~ group + tex(x1) + 
         tex(x2), vparms = c(0.1, 0.5, 0.05))

Analysis Method: Restricted ML (REML) 
Correlation structure:  gaussian 

Maximised likelihood:
      REML 
 -2871.979

Mean Parameters:
          (Intercept) groupLupins  groupMixed 
PARAMETER  4.15882994 -0.21219395 -0.10293290
STD.ERROR  0.05270437  0.05138754  0.05133712
               tex(x1)      tex(x2) 
PARAMETER  -0.22713324 -0.000936491
STD.ERROR   0.01475091  0.003068262

Variance Parameters:
    sigmasq        nu       tau        phi 
 0.05444973 0.1673984 0.6158832 0.03318887

Numbers:
 Subjects Times Missing 
       79    19     164

Iteration successfully converged.
\end{verbatim}}
\end{slide*}

\begin{slide}{}
  \heading{Generalised Estimating Equations}

  \begin{itemize}
  \item Proposed by Liang \&\ Zeger (Biometrika, 1986)
    
  \item Generalised linear models for longitudinal data

  \item Specify the mean model as usual; specify the within-series
    correlation as a ``working correlation matrix''

  \item Several choices of correlation structure, including complete
    independence, stationary Markov process, exchangeable and
    unspecified.

  \item Estimates for mean parameters and their standard errors are
    consistent, even if the within-series correlation structure is
    incorrectly specified
  \end{itemize}
\end{slide}

\begin{slide}{}
  \heading{Example: Seizure data}
  
  \begin{itemize}
  \item Clinical trial of 59 epileptic patients
  \item Patients randomised to anti-epileptic drug or placebo
  \item Measurement is seizures in four consecutive two-week periods
  \item Model as overdispersed Poisson, with baseline seizures as
    covariate
  \end{itemize}

  The actual computation is done with the {\tt gee.fit} function,
  which has an interface similar to {\tt reml.fit} and {\tt glm}.
\end{slide}

\begin{slide*}
\ptsize{8}
{\small \begin{verbatim}
> gee.fit(seizure ~ tex(in.trial)*group, family=poisson,
+   offset=tex(log(obs.period)), corstr="exchangeable")
 GEE:  GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
 gee S-function, version 3.11 modified 94/05/10 (1994) 

Model:
 Link:                      Logarithm 
 Variance to Mean Relation: Poisson 
 Correlation Structure:     Exchangeable 

Summary of Residuals:
       Min        1Q    Median       3Q      Max 
 -24.78571 -6.607143 -3.967742 2.032258 119.3548

Coefficients:
                       Estimate Naive S.E.   
        (Intercept)  3.42705076  0.1500690  
      tex(in.trial) -1.27445834  0.1530349  
              group  0.02753449  0.2056929  
tex(in.trial):group -0.10472579  0.2176669  

           Naive z Robust S.E.    Robust z
        22.8364943   0.1573571  21.7788059
        -8.3278955   0.1159304 -10.9933020
         0.1338622   0.2217878   0.1241479
        -0.4811286   0.2134448  -0.4906459

Estimated Scale Parameter:  19.41286
Number of Iterations:  1

Working Correlation
          [,1]      [,2]      [,3]      [,4]      [,5] 
[1,] 1.0000000 0.7766877 0.7766877 0.7766877 0.7766877
\end{verbatim}}
\end{slide*}

\begin{slide}{}
  \heading{Other features}

  \begin{itemize}
  \item Interactive plots ({\tt identify})
    
  \item ALR: Alternating Logistic Regressions.  GEE2 for binary data
    with parametric model for odds-ratios (only simple models allowed
    at the moment)
  \end{itemize}
  
  \heading{Future work}
  \begin{itemize}
    \item Data input from other packages

    \item Informative missing values methods (Diggle \&\ Kenward, Applied
      Statistics 1994)
    
    \item Generalisation of GEE for unbalanced data
  \end{itemize}
\end{slide}

\end{document}



%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: 

