Tuesday, February 16, 2016

Rcpp: the Shell sort code with gaps function exemples


shellsoert.htm

/*
 * The method starts by sorting pairs of elements far apart from each other, then progressively reducing the gap between elements to be compared.
 * The running time of Shellsort is heavily dependent on the gap sequence it uses.
 * Many gap function are evaluable see Link.
 * source file:Rcpp: the Shell sort code with gaps function exemples

 * 
 * The best profile was effectuated when we use sort_shell_sed function.
 * 
 *                                 MACHERKI M E.
 *                                 17/02/2016
 * 
 * 
 */

#include <Rcpp.h>
using namespace Rcpp;



//The following code is the typical body of the shell sort algorithm:
inline void shell ( NumericVector unsorted, int size, NumericVector Gaps)
{
  int gn=Gaps.size();
  int j,gap ;
  
  for (int t = gn-1; t > -1; t--)
  {
    gap=Gaps[t];
    for (int i = gap; i < size; ++i)
    {
      double temp = unsorted[i];
      for (j = i; j >= gap && temp < unsorted[j - gap]; j -= gap)
      {
        unsorted[j] = unsorted[j - gap];
      }
      unsorted[j] = temp;
    } 
  }
}
//******************gaps functions*************************

//Many gaps function can be used 
/*Shell sort function using slicing by two gaps (Shell, 1959)
 Gaps= N/2^k={N/2,N/4,N/8...1}
*/
inline NumericVector gapshell( int size ) {
  NumericVector result;
  for(int i=size/2;i>0;i/=2)
    result.push_back(i);
  return rev(result);}
/*Method of Frank & Lazarus, 1960
 Gaps=2*(N/2^k+1)+1
*/
inline NumericVector gapFrank( int N ) {
  int tmp=10;
  NumericVector result;
  for(int k=2;tmp>1;k++){
    tmp=2*(N/pow(2,k))+1;
    result.push_back(tmp);}
  return rev(result);}

/*Method of Hibbard, 1963
 Gaps=2^k-1
 */
inline NumericVector gapHibbard( int N ) {
  NumericVector result;
  int tmp=0;
  for(int k=1;tmp<N;k++){
    tmp=pow(2,k)-1;
    result.push_back(tmp);}
  return result;}

/*Method of Pratt, 1971
 Gaps=(3^k-1)/2
 */
inline NumericVector gappratt( int N ) {
  int n=N/3;
  NumericVector result;
  int tmp=0;
  for(int k=1;tmp<n;k++){
    tmp=(pow(3,k)-1)/2;
    result.push_back(tmp);}
  return result;}
/*Method of Sedgewick, 1986
 Gaps=4^k+3*2^(k-1)+1
 */
inline NumericVector gapsed( int N ) {
  NumericVector result;
  result.push_back(1);
    int tmp=0;
  for(int k=1;;k++){
    tmp=pow(4,k)+(3*pow(2,k-1))+1;
    if(tmp>N)break;
    result.push_back(tmp);}
  return result;}
/*Method of Tokuda, 1992
 Gaps=(9^k-4^k)/(5*4^(k-1))
*/

inline NumericVector gaptocuda( int N ) {
  NumericVector result;
  int tmp=0;
  for(int k=1;;k++){
    tmp=(pow(9,k)-pow(4,k))/(5*pow(4,k-1));
    if(tmp>N)break;
    result.push_back(tmp);}//we can use tmp+1
  result[0]=1;
  return result;}
/*Method of Tokuda, 1992:generic
 Gaps=h[i]=(h[i-1])*2.25+1; h[1]=1,1 base indexing
*/

inline NumericVector gaptocuda_emp( int N ) {
  NumericVector result;
  result.push_back(1);
  int tmp=1;
  for(int k=1;;k++){
    tmp=2.25*tmp+1;
    if(tmp>N)break;
    result.push_back(tmp);}//we can use tmp+1
  return result;}





//using empirically derived gaps (Ciura, 2001) N<2000

NumericVector gapCiura( int sz ) { 
  NumericVector result;
  result.push_back(1);
  result.push_back(4);
  result.push_back(10);
  result.push_back(23);
  result.push_back(57);
  result.push_back(132);
  result.push_back(301);
  result.push_back(701);
  result.push_back(1705);
  return result;}

/******************sorting functions*************************
************************************************************/







// [[Rcpp::export]]
NumericVector sort_shell_gap(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gapshell(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_frunk(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gapFrank(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_hibbard(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gapHibbard(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_patt(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gappratt(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_sed(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gapsed(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_tacuda(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gaptocuda(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_tacudaemp(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector gaps=gaptocuda_emp(N);
  NumericVector result=clone(unsorted);
  shell(result,N,gaps);
  return result;}
// [[Rcpp::export]]
NumericVector sort_shell_Ciura(NumericVector unsorted) { //run
  int N =unsorted.size();
  NumericVector Ciura=gapCiura(0);
  NumericVector result=clone(unsorted);
  shell(result,N,Ciura);
  return result;}
/***R
##trial mode
x<-runif(1000)
base<-sort(x)
a<-sort_shell_gap(x)
b<-sort_shell_frunk(x)
c<-sort_shell_hibbard(x)
d<-sort_shell_patt(x)
e<-sort_shell_sed(x)
f<-sort_shell_tacuda(x)
g<-sort_shell_tacudaemp(x)
h<-sort_shell_Ciura(x)
all.equal.list(base,a,b,c,e,f,g,h)
## timing :
x<-runif(1000000)
system.time(base<-sort(x))
system.time(a<-sort_shell_gap(x))
system.time(b<-sort_shell_frunk(x))
system.time(c<-sort_shell_hibbard(x))
system.time(d<-sort_shell_patt(x))
system.time(e<-sort_shell_sed(x))
system.time(f<-sort_shell_tacuda(x))
system.time(g<-sort_shell_tacudaemp(x))
system.time(h<-sort_shell_Ciura(x))

*/
  


Monday, February 15, 2016

Rcpp: reading file functions codes


read.file.htm
/*
All these function are useful in Reading file.read_file_delime function read the file from start to end.'Tol' is an option to make string into lower case.
                              MACHERKI M E.
                              16/02/2016


Download source file:link

*/


#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include<cctype>
#include<Rcpp.h>
using namespace Rcpp;
//[[Rcpp::export]]
CharacterVector read_file(std::string path){
std::ifstream t(path.c_str());//connect with file 
std::stringstream ss;
ss<<t.rdbuf();                // scan file
return ss.str();
}
//[[Rcpp::export]]
CharacterVector read_file1(std::string path){
std::ifstream in(path.c_str());
std::string contents;
in.seekg(0,std::ios::end);
contents.resize(in.tellg());
in.seekg(0,std::ios::beg);
in.read(&contents[0],contents.size());
in.close();
return contents;
}
// function to read file from start to end pointer
//[[Rcpp::export]]
CharacterVector read_file_delime(std::string path,int start,int end,int TOL){
std::ifstream in(path.c_str());
std::string contents;
in.seekg(0,std::ios::end);
contents.resize(in.tellg());
in.seekg(start,std::ios::beg);
in.read(&contents[0],end);
in.close();
contents.erase(std::remove(contents.begin(),contents.end(),'\n'),contents.end());
//remove lines delimiter
if(TOL>0){//force to lower case transformation
std::transform(contents.begin(),contents.end(),contents.begin(),tolower);
}
return contents.substr(start-1,end-start);
}
/* read a file from start  to end*/
/* set the file path before using */

/*** R
###*******************choose file to read************************####
con<-utils::choose.files()
system.time(FR<-readLines(con))     #vector as the split(x,'\n') 
system.time(Fcpp1<-read_file(con))  #contents Line delimiter
system.time(Fcpp2<-read_file1(con)) #contents Line delimiter
#one string without \n and in lower case
system.time(Fcpp3<-read_file_delime(con,20,100,1)) 

*/ 


Merge sort: Rcpp integration code


mergesort.htm

/*sorting function using the merge sort algorithm
*/

#include <Rcpp.h>
using namespace Rcpp;

void intercal(int p, int q, int r, NumericVector v,  NumericVector w)
{
  int i, j, k;
  i = p;
  j = q;
  k = 0;
  while (i < q && j < r) {
    if (v[i] < v[j]) {
      w[k] = v[i];
      i++;
    }
    else {
      w[k] = v[j];
      j++;
    }
    k++;
  }
  while (i < q) {
    w[k] = v[i];
    i++;
    k++;
  }
  while (j < r) {
    w[k] = v[j];
    j++;
    k++;
  }
  for (i = p; i < r; i++)
    v[i] = w[i-p];
}

void mergesort(int p, int r, NumericVector v, NumericVector aux)
{
  int q;
  if (p < r - 1) {
    q = (p + r) / 2;
    mergesort(p, q, v,aux);
    mergesort(q, r, v,aux);
    intercal(p, q, r, v,aux);
  }
}

// [[Rcpp::export]]
NumericVector sort_merge(NumericVector vetor) {
  Rcpp::NumericVector res = Rcpp::clone(vetor);
  Rcpp::NumericVector aux = Rcpp::clone(vetor);
  int n = res.size();
  mergesort(0,n,res,aux);
  return res;}


The first code:Rcpp

sourceCpp is the important function in Rcpp. The Rcpp code is written into file with'.cpp' extension. The cppFunction can be useful but we can not  easily combine many functions in the same time. This is an example to use the two possibility:
  • ccpFunction

R code:

Rcpp::cppFunction(
'int fibonacci(const int x) {
if (x == 0) return(0);
if (x == 1) return(1);
return (fibonacci(x - 1)) + fibonacci(x - 2);
}')

fibonacci(20)
  • sourceCpp
I used the example used in the Rstudio C++ source file.
Get the source file: firstrun.cpp and save it in the same R directory.

R code:

Rcpp::sourceCpp('firstrun.cpp')

Rcpp:the first step

Rcpp uses the C++ compiler to produce a sharing library connectable with R environment. The fist step is to install  the Rtools program provided in CRAN site. The second step is to install the Rcpp package.It is important to use a good  editor such Rstudio which integrates the C++ code directly.

Why I use Rcpp ?

Rcpp is an R add-on package which facilitates extending R with C++ functions. It is being used for anything from small and quickly constructed add-on functions written either to fluidly experiment with something new or to accelerate computing by replacing an R function with its C++ equivalent to large-scale bindings for existing libraries, or as a building block in entirely new research computing environments. While still relatively new as a project, Rcpp has already become widely deployed among users and developers in the R community. Rcpp is now the most popular language extension for the R system and used by over 500 CRAN packages as well as ten BioConductor packages. This site aims to provide a solid introduction to Rcpp and provides many codes of the most common algorithms.

Sunday, February 14, 2016

Numerical coding of DNA: Rcpp code

htmlsplit.html

/*
In general, DNA sequences are  stored as character data into different formats
of files like FASTA,FNA... ALso numeric coding of DNA in base 4 exists  such 
'A'=0,'G'=1,'C'=2 and 'T'=3. But why I use this kind of coding? 
The first step of sequence alignment algorithms such BLAST or FASTA consists on 
stepwise splitting of the DNA sequence into oligos of 11 bases. This operation 
influences  the execution  running time of the program.
This is an example of codes  ;the first use string manipulation and the second 
use numeric coding in base 5(to overcame  N in the sequence).
PDF file link:source file


                     MACHERKI M.E
                     14/02/2016
 */
 
 /*
 This source file  uses C++ code.You have to install Rcpp package to run it.
 A FASTA file  path is required to test this code in R.Please check the bottom 
 of the page.
 */ 
 
#include <fstream>
#include<Rcpp.h>
#include<string.h>
#include <sstream>
#include<cctype>
#include <algorithm>
using namespace Rcpp;

/*
First of all, this is a function to read FASTA file into R
-----!!!!!!!!!!!!! one sequence only!!!!!!!!!!!!!!-----------
*/
//[[Rcpp::export]]
inline CharacterVector read_fasta(std::string file ){
CharacterVector records;
std::ifstream in(file.c_str());
in.get();// remove first >
std::string rec ;
while(getline(in,rec,'>')){
int newLineLoc=rec.find('\n');
std::string header =rec.substr(0,newLineLoc);
std::string sequence= rec.substr(newLineLoc+1,rec.length()-newLineLoc-2);
sequence.erase(std::remove(sequence.begin(),sequence.end(),'\n'),sequence.end());
std::transform(sequence.begin(),sequence.end(),sequence.begin(),tolower);
records[header]=sequence;
}
return records;
}
/*
String splitter function into N size oligos (Only for lower case string).

*/
// [[Rcpp::export]]
 inline CharacterVector SPLIT(const std::string  strings,const int n){
    int lim=strings.length()-n+1 ;//size of sub string vector
    CharacterVector resultat(lim) ;
          for ( int j=0; j<lim ;j++) resultat[j] = strings.substr(j,n) ;
return resultat ;
} 
/*
String splitter function into N size oligos using numeric coding.
I used a simple hashing function which produce an unique numeric 
identifier for each oligo.It is possible to use integer but I use
double to accept a large N.
(See R help:strtoi or just type ?strtoi in console)

*/

inline double  natoi( const std::string A,const int n ){
//Used as input of numeric_split(see next function)
  int np=n-1;
  IntegerVector temp(n,4);
  char toc;
  for( int i=0;i<n;i++){
    toc=A[i];
if(toc=='a') temp[i]= 0;
if(toc=='t') temp[i]= 3;
if(toc=='c') temp[i]= 1;
if(toc=='g') temp[i]= 2;
}
double  result=0;
 for( int i=0;i<n;i++) result+= pow(5,(np-i))*temp[i];
 return result;
}

// [[Rcpp::export]]
inline NumericVector  numeric_split(const std::string A,const int n)
    {
  const int np=n-1;
  const int Lim=A.length()-np;//-- number of subsequence 
  NumericVector res(Lim);     //-- result vector
  double p=0;                 //-- temp result accumulator
  res[0]=p=natoi(A,n);        //-- first result 
  const double  N=pow(5,np);  //-- cont used in recursion formula
  unsigned short int x,y;     //-- temp to allocate forward and backward values
  char fo,ba;                 //--forward and backward chars
  for( int i=1;i<Lim;i++)
     {
       fo=A[i-1];ba=A[i+np];
       x=y=4;                 //-- empty case:n,p,m... 
       if(fo=='a')x=0;        //-- char detector 
       if(fo=='t')x=3;        // 
       if(fo=='g')x=2;        // 
       if(fo=='c')x=1;
       if(ba=='a')y=0;
       if(ba=='t')y=3;
       if(ba=='g')y=2;
       if(ba=='c')y=1;
     res[i]=p= 5*p-5*N*x+y;   //-- scalar formula 
     }
  return res;
  }
/*
R verification :
I use a file named k12.fna in my computer (full E.coli k12 genome ). 
You have just to change k12.fna into your file name.
*/


/***R
x<-read_fasta("k12.fna")  ##change me to your file name 
system.time(charsplit<-SPLIT(x,11))
#user  system elapsed 
# 5.38    0.02    5.39 
system.time(numsplit<-numeric_split(x,11))
#user  system elapsed 
#0.08    0.01    0.09    ## more then 50 time faster
  
  */