Plot SNP array raw data

In addition to genotyping, SNP arrays are good tools for copy number calling. To do that, we need to take into account not the individual genotypes of each SNP in the array, but the general shape and values of the raw data. Good tools exist for that, but visualizing the raw data is an important step and a quality control.

In this example we’ll use data from a cancer cell line obtained with an Illumina OmniExpress array. The data file has been downsampled to only 100k snps, but other from that is the standard “FinalReport” one gets from GenomeStudio.

library(karyoploteR)
dd <- read.table(file = "Examples/data/SNPArray.26T.txt.gz", sep="\t", header=TRUE, stringsAsFactors = FALSE)
head(dd)
##     SNP.Name Sample.ID Allele1...Top Allele2...Top GC.Score Sample.Name
## 1 rs11152336        34             A             C   0.7622         26T
## 2  rs4458740        34             A             C   0.8454         26T
## 3  rs4264565        34             C             C   0.8317         26T
## 4  rs4474684        34             G             G   0.7729         26T
## 5  rs7677662        34             A             A   0.7739         26T
## 6  rs4682434        34             A             C   0.4395         26T
##   Sample.Group Sample.Index SNP.Index SNP.Aux Allele1...Forward
## 1          26T           32     81073       0                 A
## 2          26T           32    443708       0                 A
## 3          26T           32    434421       0                 C
## 4          26T           32    444477       0                 C
## 5          26T           32    613842       0                 A
## 6          26T           32    456548       0                 T
##   Allele2...Forward Allele1...Design Allele2...Design Allele1...AB
## 1                 C                A                C            A
## 2                 C                T                G            A
## 3                 C                G                G            B
## 4                 C                C                C            B
## 5                 A                A                A            A
## 6                 G                A                C            A
##   Allele2...AB Allele1...Plus Allele2...Plus Chr  Position GT.Score
## 1            B              A              C  18  59864223   0.7679
## 2            B              A              C   7 156089537   0.8203
## 3            B              C              C   2 215283259   0.8107
## 4            B              C              C  16    951045   0.7734
## 5            A              A              A   4 145513053   0.7741
## 6            B              T              G   3 112534069   0.7903
##   Cluster.Sep   SNP ILMN.Strand Customer.Strand Top.Genomic.Sequence
## 1      0.7687 [A/C]         TOP             TOP                   NA
## 2      0.7304 [T/G]         BOT             TOP                   NA
## 3      1.0000 [T/G]         BOT             TOP                   NA
## 4      0.9690 [T/C]         BOT             BOT                   NA
## 5      0.9954 [A/G]         TOP             TOP                   NA
## 6      0.6738 [A/C]         TOP             BOT                   NA
##   Plus.Minus.Strand Theta     R     X     Y X.Raw Y.Raw B.Allele.Freq
## 1                 + 0.651 0.642 0.243 0.398  1266  1753        0.6157
## 2                 - 0.615 1.137 0.465 0.672  2350  2903        0.4573
## 3                 - 0.972 1.266 0.053 1.213   486  5697        1.0000
## 4                 + 0.974 1.161 0.046 1.115   432  4710        1.0000
## 5                 + 0.033 1.369 1.300 0.068  6213   460        0.0000
## 6                 - 0.330 0.465 0.296 0.169  1744   866        0.3379
##   Log.R.Ratio CNV.Value CNV.Confidence GC_probe GCC_LRR
## 1     -0.3307        NA             NA     0.31   -0.26
## 2      0.1754        NA             NA     0.44    0.18
## 3     -0.0784        NA             NA     0.31   -0.01
## 4     -0.1139        NA             NA     0.62   -0.15
## 5     -0.0547        NA             NA     0.36   -0.01
## 6     -0.1655        NA             NA     0.30   -0.10

to plot it we’ll first extract the information we need and convert it into a GRanges using the toGRanges function from regioneR and setting start and end to the snp position. The data we need is the B-allele frequency (BAF), that is, the portion of the signal coming from the allele labeled as B, and the Log R Ratio (LRR), that is, the total amount of signal. We’ll need to change the chromosome names from the Ensembl style to the UCSC style, since we are using the standard hg19 genome.

snp.data <- toGRanges(dd[,c("Chr", "Position", "Position", "B.Allele.Freq", "Log.R.Ratio")])
names(mcols(snp.data)) <- c("BAF", "LRR")
seqlevelsStyle(snp.data) <- "UCSC"
head(snp.data)
## GRanges object with 6 ranges and 2 metadata columns:
##       seqnames    ranges strand |       BAF       LRR
##          <Rle> <IRanges>  <Rle> | <numeric> <numeric>
##   [1]    chr18  59864223      * |    0.6157   -0.3307
##   [2]     chr7 156089537      * |    0.4573    0.1754
##   [3]     chr2 215283259      * |         1   -0.0784
##   [4]    chr16    951045      * |         1   -0.1139
##   [5]     chr4 145513053      * |         0   -0.0547
##   [6]     chr3 112534069      * |    0.3379   -0.1655
##   -------
##   seqinfo: 26 sequences from an unspecified genome; no seqlengths

And we are ready to plot. We’ll create a scatter plot using the kpPoints function and we’ll plot the BAF in the upper part of the data panel (r0=0.5 and r1=1) and the LRR in the lower part (r0=0 and r1=0.5).

kp <- plotKaryotype()
kpPoints(kp, data=snp.data, y=snp.data$BAF, r0=0.5, r1=1)
kpPoints(kp, data=snp.data, y=snp.data$LRR, r0=0, r1=0.5)

plot of chunk Figure1

There are a few problems with this plot. First, we have BAF is in the [0,1] range but LRR is not, and it’s bleeding over the ideograms. Second, the points are too big to reveal the patterns, specially with the chromosomes one over the other. To solve this, we’ll change to another plot type with all chromosomes in a line, plot.type=4. We’ll deal with the overflowing LRR by setting a different ymin and cutting the points below that.

lrr.below.min <- which(snp.data$LRR < -3)
snp.data$LRR[lrr.below.min] <- -3

kp <- plotKaryotype(plot.type = 4)
kpPoints(kp, data=snp.data, y=snp.data$BAF, cex=0.3, r0=0.5, r1=1)
kpPoints(kp, data=snp.data, y=snp.data$LRR, cex=0.3, r0=0, r1=0.5, ymax=2, ymin=-3)

plot of chunk Figure2

It is now possible to see the patterns of alteration, both in the BAF and the LRR. We can further refine the plot by reducing the margin between the data and the ideograms and changing the ideograms with a single line with the centromere.

pp <- getDefaultPlotParams(plot.type = 4)
pp$data1inmargin <- 2
kp <- plotKaryotype(plot.type = 4, ideogram.plotter = NULL, plot.params = pp)
kpAddCytobandsAsLine(kp)
kpPoints(kp, data=snp.data, y=snp.data$BAF, cex=0.3, r0=0.5, r1=1)
kpPoints(kp, data=snp.data, y=snp.data$LRR, cex=0.3, r0=0, r1=0.5, ymax=2, ymin=-3)

plot of chunk Figure3

Finally, we’ll mark in red the points below ymin to mark they were cropped, we’ll add markers for a couple of genes, add axis were needed and a line marking the 0 LRR. The information on the position of the genes will come from Ensembl Biomart as in the Plot Genes example.

library(biomaRt)
gene.symbols <- c("AKT", "APC", "BCR", "BIRC3", "BRAF", "BRCA1", "BRCA2", "CDKN2C", "FEV", "TP53", "PTEN", "RB1")
ensembl <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", version=67)
genes <- toGRanges(getBM(attributes=c('chromosome_name', 'start_position', 'end_position', 'hgnc_symbol'),
               filters = 'hgnc_symbol', values =gene.symbols, mart = ensembl))
seqlevelsStyle(genes) <- "UCSC"


pp <- getDefaultPlotParams(plot.type = 4)
pp$data1inmargin <- 2

kp <- plotKaryotype(plot.type = 4, ideogram.plotter = NULL, plot.params = pp)
kpAddCytobandsAsLine(kp)
kpAxis(kp, r0=0.4, r1=0.75)
kpPoints(kp, data=snp.data, y=snp.data$BAF, cex=0.3, r0=0.4, r1=0.75)
kpAxis(kp, tick.pos = c(-3, 0, 2), r0=0, r1=0.35, ymax=2, ymin=-3)
kpPoints(kp, data=snp.data, y=snp.data$LRR, cex=0.3, r0=0, r1=0.35, ymax=2, ymin=-3)
kpPoints(kp, data=snp.data[lrr.below.min], y=snp.data[lrr.below.min]$LRR, cex=0.3, r0=0, r1=0.35, ymax=2, ymin=-3, col="red")
kpAbline(kp, h=0, r0=0, r1=0.35, ymax=2, ymin=-3, col="blue")
kpPlotMarkers(kp, data=genes, labels=genes$hgnc_symbol, line.color = "#555555", marker.parts = c(0.95,0.025,0.025),  r1=1.05)

plot of chunk Figure4