/*
* The method starts by sorting pairs of elements far apart from each other, then progressively reducing the gap between elements to be compared.
* The running time of Shellsort is heavily dependent on the gap sequence it uses.
* Many gap function are evaluable see Link.
* source file:Rcpp: the Shell sort code with gaps function exemples
*
* The best profile was effectuated when we use sort_shell_sed function.
*
* MACHERKI M E.
* 17/02/2016
*
*
*/
#include <Rcpp.h>
using namespace Rcpp;
//The following code is the typical body of the shell sort algorithm:
inline void shell ( NumericVector unsorted, int size, NumericVector Gaps)
{
int gn=Gaps.size();
int j,gap ;
for (int t = gn-1; t > -1; t--)
{
gap=Gaps[t];
for (int i = gap; i < size; ++i)
{
double temp = unsorted[i];
for (j = i; j >= gap && temp < unsorted[j - gap]; j -= gap)
{
unsorted[j] = unsorted[j - gap];
}
unsorted[j] = temp;
}
}
}
//******************gaps functions*************************
//Many gaps function can be used
/*Shell sort function using slicing by two gaps (Shell, 1959)
Gaps= N/2^k={N/2,N/4,N/8...1}
*/
inline NumericVector gapshell( int size ) {
NumericVector result;
for(int i=size/2;i>0;i/=2)
result.push_back(i);
return rev(result);}
/*Method of Frank & Lazarus, 1960
Gaps=2*(N/2^k+1)+1
*/
inline NumericVector gapFrank( int N ) {
int tmp=10;
NumericVector result;
for(int k=2;tmp>1;k++){
tmp=2*(N/pow(2,k))+1;
result.push_back(tmp);}
return rev(result);}
/*Method of Hibbard, 1963
Gaps=2^k-1
*/
inline NumericVector gapHibbard( int N ) {
NumericVector result;
int tmp=0;
for(int k=1;tmp<N;k++){
tmp=pow(2,k)-1;
result.push_back(tmp);}
return result;}
/*Method of Pratt, 1971
Gaps=(3^k-1)/2
*/
inline NumericVector gappratt( int N ) {
int n=N/3;
NumericVector result;
int tmp=0;
for(int k=1;tmp<n;k++){
tmp=(pow(3,k)-1)/2;
result.push_back(tmp);}
return result;}
/*Method of Sedgewick, 1986
Gaps=4^k+3*2^(k-1)+1
*/
inline NumericVector gapsed( int N ) {
NumericVector result;
result.push_back(1);
int tmp=0;
for(int k=1;;k++){
tmp=pow(4,k)+(3*pow(2,k-1))+1;
if(tmp>N)break;
result.push_back(tmp);}
return result;}
/*Method of Tokuda, 1992
Gaps=(9^k-4^k)/(5*4^(k-1))
*/
inline NumericVector gaptocuda( int N ) {
NumericVector result;
int tmp=0;
for(int k=1;;k++){
tmp=(pow(9,k)-pow(4,k))/(5*pow(4,k-1));
if(tmp>N)break;
result.push_back(tmp);}//we can use tmp+1
result[0]=1;
return result;}
/*Method of Tokuda, 1992:generic
Gaps=h[i]=(h[i-1])*2.25+1; h[1]=1,1 base indexing
*/
inline NumericVector gaptocuda_emp( int N ) {
NumericVector result;
result.push_back(1);
int tmp=1;
for(int k=1;;k++){
tmp=2.25*tmp+1;
if(tmp>N)break;
result.push_back(tmp);}//we can use tmp+1
return result;}
//using empirically derived gaps (Ciura, 2001) N<2000
NumericVector gapCiura( int sz ) {
NumericVector result;
result.push_back(1);
result.push_back(4);
result.push_back(10);
result.push_back(23);
result.push_back(57);
result.push_back(132);
result.push_back(301);
result.push_back(701);
result.push_back(1705);
return result;}
/******************sorting functions*************************
************************************************************/
// [[Rcpp::export]]
NumericVector sort_shell_gap(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gapshell(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_frunk(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gapFrank(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_hibbard(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gapHibbard(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_patt(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gappratt(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_sed(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gapsed(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_tacuda(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gaptocuda(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_tacudaemp(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector gaps=gaptocuda_emp(N);
NumericVector result=clone(unsorted);
shell(result,N,gaps);
return result;}
// [[Rcpp::export]]
NumericVector sort_shell_Ciura(NumericVector unsorted) { //run
int N =unsorted.size();
NumericVector Ciura=gapCiura(0);
NumericVector result=clone(unsorted);
shell(result,N,Ciura);
return result;}
/***R
##trial mode
x<-runif(1000)
base<-sort(x)
a<-sort_shell_gap(x)
b<-sort_shell_frunk(x)
c<-sort_shell_hibbard(x)
d<-sort_shell_patt(x)
e<-sort_shell_sed(x)
f<-sort_shell_tacuda(x)
g<-sort_shell_tacudaemp(x)
h<-sort_shell_Ciura(x)
all.equal.list(base,a,b,c,e,f,g,h)
## timing :
x<-runif(1000000)
system.time(base<-sort(x))
system.time(a<-sort_shell_gap(x))
system.time(b<-sort_shell_frunk(x))
system.time(c<-sort_shell_hibbard(x))
system.time(d<-sort_shell_patt(x))
system.time(e<-sort_shell_sed(x))
system.time(f<-sort_shell_tacuda(x))
system.time(g<-sort_shell_tacudaemp(x))
system.time(h<-sort_shell_Ciura(x))
*/
Tuesday, February 16, 2016
Rcpp: the Shell sort code with gaps function exemples
Monday, February 15, 2016
Rcpp: reading file functions codes
/*
All these function are useful in Reading file.read_file_delime function read the file from start to end.'Tol' is an option to make string into lower case.
MACHERKI M E.
16/02/2016
Download source file:link
*/
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include<cctype>
#include<Rcpp.h>
using namespace Rcpp;
//[[Rcpp::export]]
CharacterVector read_file(std::string path){
std::ifstream t(path.c_str());//connect with file
std::stringstream ss;
ss<<t.rdbuf(); // scan file
return ss.str();
}
//[[Rcpp::export]]
CharacterVector read_file1(std::string path){
std::ifstream in(path.c_str());
std::string contents;
in.seekg(0,std::ios::end);
contents.resize(in.tellg());
in.seekg(0,std::ios::beg);
in.read(&contents[0],contents.size());
in.close();
return contents;
}
// function to read file from start to end pointer
//[[Rcpp::export]]
CharacterVector read_file_delime(std::string path,int start,int end,int TOL){
std::ifstream in(path.c_str());
std::string contents;
in.seekg(0,std::ios::end);
contents.resize(in.tellg());
in.seekg(start,std::ios::beg);
in.read(&contents[0],end);
in.close();
contents.erase(std::remove(contents.begin(),contents.end(),'\n'),contents.end());
//remove lines delimiter
if(TOL>0){//force to lower case transformation
std::transform(contents.begin(),contents.end(),contents.begin(),tolower);
}
return contents.substr(start-1,end-start);
}
/* read a file from start to end*/
/* set the file path before using */
/*** R
###*******************choose file to read************************####
con<-utils::choose.files()
system.time(FR<-readLines(con)) #vector as the split(x,'\n')
system.time(Fcpp1<-read_file(con)) #contents Line delimiter
system.time(Fcpp2<-read_file1(con)) #contents Line delimiter
#one string without \n and in lower case
system.time(Fcpp3<-read_file_delime(con,20,100,1))
*/
Merge sort: Rcpp integration code
/*sorting function using the merge sort algorithm
*/
#include <Rcpp.h>
using namespace Rcpp;
void intercal(int p, int q, int r, NumericVector v, NumericVector w)
{
int i, j, k;
i = p;
j = q;
k = 0;
while (i < q && j < r) {
if (v[i] < v[j]) {
w[k] = v[i];
i++;
}
else {
w[k] = v[j];
j++;
}
k++;
}
while (i < q) {
w[k] = v[i];
i++;
k++;
}
while (j < r) {
w[k] = v[j];
j++;
k++;
}
for (i = p; i < r; i++)
v[i] = w[i-p];
}
void mergesort(int p, int r, NumericVector v, NumericVector aux)
{
int q;
if (p < r - 1) {
q = (p + r) / 2;
mergesort(p, q, v,aux);
mergesort(q, r, v,aux);
intercal(p, q, r, v,aux);
}
}
// [[Rcpp::export]]
NumericVector sort_merge(NumericVector vetor) {
Rcpp::NumericVector res = Rcpp::clone(vetor);
Rcpp::NumericVector aux = Rcpp::clone(vetor);
int n = res.size();
mergesort(0,n,res,aux);
return res;}
The first code:Rcpp
sourceCpp is the important function in Rcpp. The Rcpp code is written into file with'.cpp' extension. The cppFunction can be useful but we can not easily combine many functions in the same time. This is an example to use the two possibility:
- ccpFunction
R code:
Rcpp::cppFunction(
'int fibonacci(const int x) {
if (x == 0) return(0);
if (x == 1) return(1);
return (fibonacci(x - 1)) + fibonacci(x - 2);
}')
fibonacci(20)
- sourceCpp
I used the example used in the Rstudio C++ source file.
Get the source file: firstrun.cpp and save it in the same R directory.
R code:
Rcpp::sourceCpp('firstrun.cpp')
Rcpp:the first step
Rcpp uses the C++ compiler to produce a sharing library connectable with R environment. The fist step is to install the Rtools program provided in CRAN site. The second step is to install the Rcpp package.It is important to use a good editor such Rstudio which integrates the C++ code directly.
Why I use Rcpp ?
Rcpp is an R add-on package which facilitates extending R with C++ functions. It is being used for anything from small and quickly constructed add-on functions written either to fluidly experiment with something new or to accelerate computing by replacing an R function with its C++ equivalent to large-scale bindings for existing libraries, or as a building block in entirely new research computing environments. While still relatively new as a project, Rcpp has already become widely deployed among users and developers in the R community. Rcpp is now the most popular language extension for the R system and used by over 500 CRAN packages as well as ten BioConductor packages. This site aims to provide a solid introduction to Rcpp and provides many codes of the most common algorithms.
Sunday, February 14, 2016
Numerical coding of DNA: Rcpp code
/*
In general, DNA sequences are stored as character data into different formats
of files like FASTA,FNA... ALso numeric coding of DNA in base 4 exists such
'A'=0,'G'=1,'C'=2 and 'T'=3. But why I use this kind of coding?
The first step of sequence alignment algorithms such BLAST or FASTA consists on
stepwise splitting of the DNA sequence into oligos of 11 bases. This operation
influences the execution running time of the program.
This is an example of codes ;the first use string manipulation and the second
use numeric coding in base 5(to overcame N in the sequence).
PDF file link:source file
MACHERKI M.E
14/02/2016
*/
/*
This source file uses C++ code.You have to install Rcpp package to run it.
A FASTA file path is required to test this code in R.Please check the bottom
of the page.
*/
#include <fstream>
#include<Rcpp.h>
#include<string.h>
#include <sstream>
#include<cctype>
#include <algorithm>
using namespace Rcpp;
/*
First of all, this is a function to read FASTA file into R
-----!!!!!!!!!!!!! one sequence only!!!!!!!!!!!!!!-----------
*/
//[[Rcpp::export]]
inline CharacterVector read_fasta(std::string file ){
CharacterVector records;
std::ifstream in(file.c_str());
in.get();// remove first >
std::string rec ;
while(getline(in,rec,'>')){
int newLineLoc=rec.find('\n');
std::string header =rec.substr(0,newLineLoc);
std::string sequence= rec.substr(newLineLoc+1,rec.length()-newLineLoc-2);
sequence.erase(std::remove(sequence.begin(),sequence.end(),'\n'),sequence.end());
std::transform(sequence.begin(),sequence.end(),sequence.begin(),tolower);
records[header]=sequence;
}
return records;
}
/*
String splitter function into N size oligos (Only for lower case string).
*/
// [[Rcpp::export]]
inline CharacterVector SPLIT(const std::string strings,const int n){
int lim=strings.length()-n+1 ;//size of sub string vector
CharacterVector resultat(lim) ;
for ( int j=0; j<lim ;j++) resultat[j] = strings.substr(j,n) ;
return resultat ;
}
/*
String splitter function into N size oligos using numeric coding.
I used a simple hashing function which produce an unique numeric
identifier for each oligo.It is possible to use integer but I use
double to accept a large N.
(See R help:strtoi or just type ?strtoi in console)
*/
inline double natoi( const std::string A,const int n ){
//Used as input of numeric_split(see next function)
int np=n-1;
IntegerVector temp(n,4);
char toc;
for( int i=0;i<n;i++){
toc=A[i];
if(toc=='a') temp[i]= 0;
if(toc=='t') temp[i]= 3;
if(toc=='c') temp[i]= 1;
if(toc=='g') temp[i]= 2;
}
double result=0;
for( int i=0;i<n;i++) result+= pow(5,(np-i))*temp[i];
return result;
}
// [[Rcpp::export]]
inline NumericVector numeric_split(const std::string A,const int n)
{
const int np=n-1;
const int Lim=A.length()-np;//-- number of subsequence
NumericVector res(Lim); //-- result vector
double p=0; //-- temp result accumulator
res[0]=p=natoi(A,n); //-- first result
const double N=pow(5,np); //-- cont used in recursion formula
unsigned short int x,y; //-- temp to allocate forward and backward values
char fo,ba; //--forward and backward chars
for( int i=1;i<Lim;i++)
{
fo=A[i-1];ba=A[i+np];
x=y=4; //-- empty case:n,p,m...
if(fo=='a')x=0; //-- char detector
if(fo=='t')x=3; //
if(fo=='g')x=2; //
if(fo=='c')x=1;
if(ba=='a')y=0;
if(ba=='t')y=3;
if(ba=='g')y=2;
if(ba=='c')y=1;
res[i]=p= 5*p-5*N*x+y; //-- scalar formula
}
return res;
}
/*
R verification :
I use a file named k12.fna in my computer (full E.coli k12 genome ).
You have just to change k12.fna into your file name.
*/
/***R
x<-read_fasta("k12.fna") ##change me to your file name
system.time(charsplit<-SPLIT(x,11))
#user system elapsed
# 5.38 0.02 5.39
system.time(numsplit<-numeric_split(x,11))
#user system elapsed
#0.08 0.01 0.09 ## more then 50 time faster
*/
Subscribe to:
Posts (Atom)