上了几天的课,http://ibw2011.fmmu.edu.cn/schedule.htm 今天就上完了,只完成了project 1,想写gibbs sampling,但是没搞明白,汗。

这个纯属练习用,没啥实用价值。

Course Projects:

Project 1: Implementation of a simple gene finder

GOAL

Build a simple codon-usage based gene finder for finding genes in E.coli.

Procedure

Collect 100 gene sequences from the bacterium E. coli in the genbank (http://www.ncbi.nlm.nihh.gov). Compute the codon usage table based on these genes (and the translated protein sequences from them); Build a probabilistic model based on the codon usages; Implement a random sequence model in which the nucleotide frequency is computed from the 100 E. coli genes. For a given DNA sequence (and one selected reading frame), compare your model with a random sequence model; Results that you should submit:

Two FASTA files for the collected 100 genes and 100 translated protein sequences; The printed codon usage table; A program named ECgnfinder, running with the syntax as ECgnfinder –i inputfile

Inputfile stands for the name of input file, which should contain one DNA sequence in FASTA file format; the program should be able to report an error message if the input file is in the wrong format.

The output should be printed to the standard output as (xxx stands for the likelihood)

ORF1: xxx ORF2: xxx


###########################################################
##                                                       ##
##  Dragonstar Course Project (IBW 2011)                 ##
##                                                       ##
##  Project 1: Implementation of a simple gene finder    ##
##  Author Guangchuang your                              ##
##  Xi'an (2011-07-08)                                   ##
##                                                       ##
###########################################################



##' .. content for \description{} (ECgnfinder: An R program to predict prokaryotic genes (e.g. E.coli) using 1st Markov Chain) ..
##'
##' .. content for \details{} ..
##' @title ECgnfinder
##' @param traningSequence : A FASTA file containing multiple sequences for training set
##' @param testingSequence : A FASTA file containing multiple sequences for testing
##' @param backgroundSequence : A FASTA file containing multiple sequences for building the background model
##' @return log likelihood ratio
##' @author Guangchuang Yu
ECgnfinder <- function(trainingSequence, testingSequence, backgroundSequence, print=FALSE) {
    print ("reading training sequence file...")
    trainingFasta <- readFASTA(trainingSequence)
    trainSeq <- sapply(trainingFasta, function(i) i$seq)

    print("building Transition Probability Matrix...")
    TPM <- getTPM(trainSeq)

    print("building codon usage table...")
    codonUsage <- getCodonUsage(trainSeq)

    print("building NucleoTide table...")
    NtUsage <- getNtUsage(backgroundSequence)


    print("reading testing sequence file...")
    testing <- readFASTA(testingSequence)
    testingSeq <- sapply(testing, function(i) i$seq)

    x <- lapply(testingSeq, logLikelihood, transition=TPM, codonusage=codonUsage, background=NtUsage)
    names(x) <- paste("Sequence", seq_along(testingSeq), sep="")
    if (print) {
        x
    }
    return(x)
}


##' .. content for \description{} (read FASTA file) ..
##'
##' .. content for \details{} ..
##' @title readFASTA
##' @param file: A FASTA file
##' @return a list of list which containing desc and seq.
##' @author Guangchuang Yu
readFASTA <- function(file) {
    x=scan(file, what="", sep="\n")
    idx <- grep(">", x)
    fasta <- list()
    for (i in 1:length(idx)) {
            desc <- x[idx[i]]
            if (i == length(idx)) {
                stop <- length(x)
            } else {
                stop <- idx[i+1]-1
            }
            seq.idx <- seq(idx[i]+1, stop)
            s <- x[seq.idx]
            s <- paste(s, collapse="")
            ss <- list(desc=desc, seq=s)
            fasta[i] <- list(ss)
    }
    return(fasta)
    }


##' Build a transition probability matrix from the given Sequence set.
##'
##' @title getTPM
##' @param sequences : A vector of Sequences.
##' @return transition probability matrix
##' @author Guangchuang Yu
getTPM <- function(sequences) {
    codonLength <- 3
    codons <- getCodon()
    
    ## calculate a transition count matrix
    transition <- matrix(1, nrow=length(codons), ncol=length(codons))  ## pseudo count
    colnames(transition) <- codons
    rownames(transition) <- codons

    for (x in 1:length(sequences)) {
        s <- sequences[x]
        len <- nchar(s)
        ## drop the last alphabets which is too short (

        idx <- seq(1, len, codonLength)
        for (i in seq_along(idx)) {
            ## codonI--codonJ
            codonI <- substr(s,idx[i], idx[i]+codonLength-1)
            if (i != length(idx)) {
                codonJ <- substr(s,idx[i+1], idx[i+1]+codonLength-1)
            }
            if (codonI %in% codons & codonJ %in% codons) {
                transition[codonI, codonJ] <- transition[codonI, codonJ] +1
            } else {
                warning("codon ", codonI, " or ", codonJ, " which does not defined was detected...")
            }
        }
    }

    ## Calculate Transition Probability Matrix..
    rowsum <- rowSums(transition)
    TPM <- transition/rowsum
    return(TPM)
}

getCodon <- function() {
    codons <- c("TTT", "TTC", "TTA", "TTG",
                "CTT", "CTC", "CTA", "CTG",
                "ATT", "ATC", "ATA", "ATG",
                "GTT", "GTC", "GTA", "GTG",
                "TCT", "TCC", "TCA", "TCG",
                "CCT", "CCC", "CCA", "CCG",
                "ACT", "ACC", "ACA", "ACG",
                "GCT", "GCC", "GCA", "GCG",
                "TAT", "TAC", "TAA", "TAG",
                "CAT", "CAC", "CAA", "CAG",
                "AAT", "AAC", "AAA", "AAG",
                "GAT", "GAC", "GAA", "GAG",
                "TGT", "TGC", "TGA", "TGG",
                "CGT", "CGC", "CGA", "CGG",
                "AGT", "AGC", "AGA", "AGG",
                "GGT", "GGC", "GGA", "GGG"
                )
    return(codons)
}


##' .. content for \description{} (estimate codon usage table based on training sequences) ..
##'
##' .. content for \details{} ..
##' @title getCodonUsage
##' @param sequences : A vector of sequence set.
##' @return A data.frame containing two columns of codons and Freq (frequency probabilities)
##' @author Guangchuang Yu
getCodonUsage <- function(sequences) {
    codons <- getCodon()
    codon.table <- data.frame(codons=codons, Freq=rep(0, length(codons)))
    rownames(codon.table) = codons

    for (i in 1:length(sequences)) {
        codon <- codonCount(sequences[i])
        rowname.idx <- as.character(codon[,1])
        codon.table[rowname.idx,2] <- codon.table[rowname.idx,2] + codon[,2]
    }
    codon.table[,2] <- codon.table[,2]/sum(codon.table[,2])
                                        #print(codon.table)  #### print result.
    return(codon.table)
}

##' .. content for \description{} (no empty lines) ..
##'
##' .. content for \details{} ..
##' @title codonCount
##' @param seqStr : A sequence string
##' @return A data.frame containing two columns of codons and their counts
##' @author Guangchuang Yu
codonCount <- function(seqStr) {
    codonLength=3
    len <- nchar(seqStr)
    ## drop the last alphabets which is too short (

    idx <- seq(1, len,codonLength)
    codons <- c()
    for ( i in 1:length(idx)) {
        codon <- substr(seqStr, idx[i], idx[i]+codonLength-1)
        codons <- c(codons, codon)
    }
    codons.df <- as.data.frame(table(codons))
    return(codons.df)
}

##' .. content for \description{} (reverse complementary sequence) ..
##'
##' .. content for \details{} ..
##' @title revcom
##' @param seqStr : A sequence string
##' @return A reverse complementary sequence
##' @author Guangchuang Yu
revcom <- function(seqStr) {
    seqStr.str <- unlist(strsplit(seqStr, split=""))
    idxT <- which(seqStr.str=="T")
    idxG <- which(seqStr.str=="G")
    idxA <- which(seqStr.str=="A")
    idxC <- which(seqStr.str=="C")
    seqStr.str[idxT] <- "A"
    seqStr.str[idxA] <- "T"
    seqStr.str[idxG] <- "C"
    seqStr.str[idxC] <- "G"
    seqStr.rc <- rev(seqStr.str)
    seqStr.rc <- paste(seqStr.rc, collapse="")
    return(seqStr.rc)
}

##' .. content for \description{} (calculate NT frequency probabilities of a given sequence set) ..
##'
##' .. content for \details{use to estimate the background/random NT usage} ..
##' @title getNtUsage
##' @param file : A FASTA file
##' @return A data.frame containing NTs and their corresponding frequency probabilities.
##' @author Guangchuang Yu
getNtUsage <- function(file) {
    print("reading FASTA file")
    fasta <- readFASTA(file)
    sequences <- sapply(fasta, function(i) i$seq)
    x <- sapply(sequences, strsplit, split="")
    x <- unlist(x)
    NTprob <- as.data.frame(table(x))
    NTprob[,2] <- NTprob[,2]/sum(NTprob[,2])
    rownames(NTprob) <- NTprob[,1]
    return(NTprob)
}

##' .. content for \description{} (calculate the log likelihood of the six possible reading frame of a given sequence) ..
##'
##' .. content for \details{} ..
##' @title logLikelihood
##' @param sequence : A sequence string
##' @param transition : Transition Probatility Matrix
##' @param codonusage : Codon Usage data.frame
##' @param background : NT Usage data.frame for background/random model
##' @return Log likelihood
##' @author Guangchuang Yu
logLikelihood <- function(sequence, transition, codonusage, background) {
    codonLength <- 3
    orf1 <- sequence
    orf2 <- substr(sequence, 2, nchar(sequence))
    orf3 <- substr(sequence, 3, nchar(sequence))
    seq.rc <- revcom(sequence)
    orf4 <- seq.rc
    orf5 <- substr(seq.rc, 2, nchar(seq.rc))
    orf6 <- substr(seq.rc, 3, nchar(seq.rc))
    ORF <- c(orf1, orf2, orf3, orf4, orf5, orf6)
    names(ORF) <- c("ORF1", "ORF2", "ORF3", "ORF4", "ORF5", "ORF6")

    result <- data.frame(ORF=names(ORF), logLikelihood=rep(0,6))
    for (j in seq_along(ORF)) {
        firstCodon <- substr(ORF[j], 1, 3)
        if (firstCodon %in% codonusage[,1]) {
            initProb <- log(codonusage[firstCodon,2])
        } else {
            initProb <- log(1/64)
        }

        transitionCount <- matrix(0, nrow=nrow(transition), ncol=ncol(transition))
        rownames(transitionCount) <- rownames(transition)
        colnames(transitionCount) <- colnames(transition)

        codons <- rownames(transition)

        len <- nchar(ORF[j])
        len <- len - len %% codonLength - codonLength

        idx <- seq(1, len, codonLength)
        for (i in seq_along(idx)) {
            ## codonI--codonJ
            codonI <- substr(ORF[j],idx[i], idx[i]+codonLength-1)
            if (i != length(idx)) {
                codonJ <- substr(ORF[j],idx[i+1], idx[i+1]+codonLength-1)
            }
            if (codonI %in% codons & codonJ %in% codons) {
                transitionCount[codonI, codonJ] <- transitionCount[codonI, codonJ] +1
            } else {
                warning("codon ", codonI, " or ", codonJ, " which does not defined was detected...")
            }
        }

        postTransition <- log(transition) * transitionCount
        posterior <- initProb + sum(postTransition)

        NTseq <- unlist(strsplit(ORF[j], split=""))
        orf.len <- nchar(ORF[j]) - nchar(ORF[j]) %% codonLength
        NTseq <- NTseq[1:orf.len]
        randomModel <- sum(log(background[NTseq,2]))

        likelihood <- posterior - randomModel
        result[j, 2] <- likelihood
    }
    return(result)
}