Logo Search packages:      
Sourcecode: velvet version File versions

tightString.c

/*
Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)

    This file is part of Velvet.

    Velvet is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Velvet is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Velvet; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include "globals.h"

typedef unsigned char Codon;

struct tString_st {
      Codon *sequence;
      Coordinate length;
      Coordinate arrayLength;
};

static const Nucleotide Adenine = 0;
static const Nucleotide Cytosine = 1;
static const Nucleotide Guanine = 2;
static const Nucleotide Thymine = 3;

// Binary 11111100
static const Codon FILTER0 = ~((Codon) 3);
// Binary 11110011
static const Codon FILTER1 = ~(((Codon) 3) << 2);
// Binary 11001111
static const Codon FILTER2 = ~(((Codon) 3) << 4);
// Binary 00111111
static const Codon FILTER3 = (Codon) ~ ((Codon) 3 << 6);

//
// Adds a number into the Codon pointed to by codonPtr at the desired
// position (0, 1, 2, or 3);
//
void
writeNucleotideNumber(Nucleotide nucleotide, Codon * codonPtr,
                  Coordinate position)
{
      if (position == 3) {
            *codonPtr &= FILTER3;
            *codonPtr += nucleotide << 6;
      } else if (position == 2) {
            *codonPtr &= FILTER2;
            *codonPtr += nucleotide << 4;
      } else if (position == 1) {
            *codonPtr &= FILTER1;
            *codonPtr += nucleotide << 2;
      } else if (position == 0) {
            *codonPtr &= FILTER0;
            *codonPtr += nucleotide;
      }
}

//
// Adds a nucleotide into the Codon pointed to by codonPtr at the desired
// position (0, 1, 2, or 3);
//
Nucleotide charToNucleotide(char c)
{
      switch (c) {
      case 'A':
            return Adenine;
      case 'C':
            return Cytosine;
      case 'G':
            return Guanine;
      case 'T':
            return Thymine;
      case 'a':
            return Adenine;
      case 'c':
            return Cytosine;
      case 'g':
            return Guanine;
      case 't':
            return Thymine;
      case '\n':
            return '\n';
      default:
            return Adenine;
      }

}

//
// Adds a nucleotide into the Codon pointed to by codonPtr at the desired
// position (0, 1, 2, or 3);
//
void writeNucleotide(Nucleotide nucleotide, Codon * codonPtr, int position)
{
      int nucleotideNum;

      switch (nucleotide) {
      case 'A':
            nucleotideNum = Adenine;
            break;
      case 'C':
            nucleotideNum = Cytosine;
            break;
      case 'G':
            nucleotideNum = Guanine;
            break;
      case 'T':
            nucleotideNum = Thymine;
            break;
      case 'a':
            nucleotideNum = Adenine;
            break;
      case 'c':
            nucleotideNum = Cytosine;
            break;
      case 'g':
            nucleotideNum = Guanine;
            break;
      case 't':
            nucleotideNum = Thymine;
            break;
      default:
            nucleotideNum = Adenine;
      }

      writeNucleotideNumber(nucleotideNum, codonPtr, position);
}

//
// Creates a tightString from a tradionnal string of A,T,G, and C of length size
//
TightString *newTightStringFromString(char *sequence)
{
      TightString *newTString = malloc(sizeof(TightString));
      if (newTString == NULL) {
            puts("Malloc failure");
            exit(1);
      }

      int size = (int) strlen(sequence);
      int arrayLength = size / 4;
      int index;

      if (size % 4 > 0)
            arrayLength++;

      newTString->length = size;
      newTString->arrayLength = arrayLength;
      newTString->sequence = calloc(arrayLength, sizeof(Codon));

      if (newTString->sequence == NULL && arrayLength > 0) {
            puts("Calloc failure");
            exit(1);
      }

      for (index = 0; index < arrayLength; index++)
            newTString->sequence[index] = 0;

      for (index = 0; index < size; index++)
            writeNucleotide(sequence[index],
                        &(newTString->sequence[index / 4]),
                        index % 4);

      free(sequence);
      return newTString;
}

//
// Creates a tightString from an array of normal strings
//
TightString **newTightStringArrayFromStringArray(char **sequences,
                                     IDnum sequenceCount)
{
      IDnum sequenceIndex;
      TightString **tStringArray =
          malloc(sizeof(TightString *) * sequenceCount);
      if (tStringArray == NULL && sequenceCount > 0) {
            puts("Malloc failure");
            exit(1);
      }

      for (sequenceIndex = 0; sequenceIndex < sequenceCount;
           sequenceIndex++)
            tStringArray[sequenceIndex] =
                newTightStringFromString(sequences[sequenceIndex]);

      free(sequences);
      return tStringArray;
}

char readNucleotide(Nucleotide nucleotide)
{
      switch (nucleotide) {
      case 0:
            return 'A';
      case 1:
            return 'C';
      case 2:
            return 'G';
      case 3:
            return 'T';
      }

      return '?';
}

char *readTightString(TightString * tString)
{
      Coordinate index, index4;
      char *string;
      Codon codon;

      if (tString == NULL || tString->length == 0) {
            string = calloc(5, sizeof(char));
            if (string == NULL) {
                  puts("Calloc failure");
                  exit(1);
            }
            strcpy(string, "VOID");
            return string;
      }

      string = calloc(tString->length + 1, sizeof(char));
      if (string == NULL) {
            puts("Calloc failure");
            exit(1);
      }

      for (index = 0; index < tString->length / 4; index++) {
            index4 = index << 2;
            codon = tString->sequence[index];
            string[index4] = readNucleotide(codon & 3);
            string[index4 + 1] = readNucleotide((codon & 12) >> 2);
            string[index4 + 2] = readNucleotide((codon & 48) >> 4);
            string[index4 + 3] = readNucleotide((codon & 192) >> 6);
      }

      index4 = index << 2;
      codon = tString->sequence[index];

      switch (tString->length % 4) {
      case 3:
            string[index4 + 3] = readNucleotide((codon & 192) >> 6);
      case 2:
            string[index4 + 2] = readNucleotide((codon & 48) >> 4);
      case 1:
            string[index4 + 1] = readNucleotide((codon & 12) >> 2);
      case 0:
            string[index4] = readNucleotide(codon & 3);
      }

      string[tString->length] = '\0';

      return string;
}

Nucleotide getNucleotide(Coordinate nucleotideIndex, TightString * tString)
{
      Codon codon = tString->sequence[nucleotideIndex / 4];

      switch (nucleotideIndex % 4) {
      case 3:
            return (codon & 192) >> 6;
      case 2:
            return (codon & 48) >> 4;
      case 1:
            return (codon & 12) >> 2;
      case 0:
            return (codon & 3);
      }

      return '?';
}

void readTightStringFragment(TightString * tString, Coordinate start,
                       Coordinate finish, char *string)
{
      Coordinate index;
      Coordinate inFinish = finish;

      if (start >= tString->length) {
            string[0] = '\0';
            return;
      }

      if (inFinish > tString->length)
            inFinish = tString->length;

      for (index = start; index < inFinish; index++) {
            string[index - start] =
                readNucleotide(getNucleotide(index, tString));
      }

      string[inFinish - start] = '\0';
}

char getNucleotideChar(Coordinate nucleotideIndex, TightString * tString)
{
      Codon codon;

      // DEBUG
      if (nucleotideIndex >= tString->length)
            abort();

      codon = tString->sequence[nucleotideIndex / 4];

      switch (nucleotideIndex % 4) {
      case 3:
            return readNucleotide((codon & 192) >> 6);
      case 2:
            return readNucleotide((codon & 48) >> 4);
      case 1:
            return readNucleotide((codon & 12) >> 2);
      case 0:
            return readNucleotide((codon & 3));
      }

      return '?';
}

char getInverseNucleotideChar(Coordinate nucleotideIndex,
                        TightString * tString)
{
      Codon codon = tString->sequence[nucleotideIndex / 4];

      switch (nucleotideIndex % 4) {
#ifndef COLOR
      case 3:
            return readNucleotide(3 - ((codon & 192) >> 6));
      case 2:
            return readNucleotide(3 - ((codon & 48) >> 4));
      case 1:
            return readNucleotide(3 - ((codon & 12) >> 2));
      case 0:
            return readNucleotide(3 - ((codon & 3)));
#else
      case 3:
            return readNucleotide(((codon & 192) >> 6));
      case 2:
            return readNucleotide(((codon & 48) >> 4));
      case 1:
            return readNucleotide(((codon & 12) >> 2));
      case 0:
            return readNucleotide(((codon & 3)));
#endif
      }

      return '?';
}

TightString *newTightString(Coordinate length)
{
      Coordinate arrayLength = length / 4;
      Coordinate index;
      TightString *newTString = malloc(sizeof(TightString));
      if (newTString == NULL) {
            puts("Malloc failure");
            exit(1);
      }
      if (length % 4 > 0)
            arrayLength++;

      newTString->length = length;
      newTString->arrayLength = arrayLength;
      newTString->sequence = calloc(arrayLength, sizeof(Codon));

      if (newTString->sequence == NULL && arrayLength > 0) {
            puts("Calloc failure");
            exit(1);
      }

      for (index = 0; index < arrayLength; index++)
            newTString->sequence[index] = 0;

      return newTString;
}

void
writeNucleotideAtPosition(Nucleotide nucleotide, Coordinate position,
                    TightString * tString)
{
      if (position >= tString->length)
            return;

      writeNucleotideNumber(nucleotide,
                        &tString->sequence[position / 4],
                        position % 4);
}

void trimTightString(TightString * tString, Coordinate length)
{
      Codon *tmpPtr;
      Coordinate newArrayLength = length / 4;
      if (length % 4 == 0)
            newArrayLength++;

      tString->length = length;
      tString->arrayLength = newArrayLength;
      tmpPtr =
          realloc(tString->sequence, newArrayLength * sizeof(Codon));
      if (tmpPtr != NULL)
            tString->sequence = tmpPtr;
}

Coordinate getLength(TightString * tString)
{
      return tString->length;
}

TightString **concatenateTightStringArrays(TightString ** array1,
                                 TightString ** array2,
                                 IDnum size1, IDnum size2)
{
      TightString **unionArray;
      IDnum index;

      if (array1 == NULL)
            return array2;

      if (array2 == NULL)
            return array1;

      unionArray =
          realloc(array1, sizeof(TightString *) * (size1 + size2));
      if (unionArray == NULL) {
            puts("Reallocation failed!");
            exit(1);
      }

      for (index = 0; index < size2; index++)
            unionArray[size1 + index] = array2[index];

      free(array2);

      return unionArray;
}

void destroyTightString(TightString * tString)
{
      free(tString->sequence);
      free(tString);
}

void destroyTightStringArray(TightString ** array, IDnum sequenceCount)
{
      IDnum index;
      for (index = 0; index < sequenceCount; index++)
            destroyTightString(array[index]);
      free(array);
}

//Reverses a word into its Watson-Crick reverse complement
Kmer reverseComplement(Kmer word, int WORDLENGTH)
{
      int index;
      Kmer revComp = 0;
      Kmer copy = word;
      Nucleotide nucleotide;

      for (index = 0; index < WORDLENGTH; index++) {
            nucleotide = (char) (copy & 3);
            revComp <<= 2;
#ifndef COLOR
            revComp += 3 - nucleotide;
#else
            revComp += nucleotide;
#endif
            copy >>= 2;
      }

      return revComp;
}

void setTightStringLength(TightString * tString, Coordinate length)
{
      Coordinate newArrayLength = length / 4;
      if (length % 4 > 0)
            newArrayLength++;

      if (newArrayLength > tString->arrayLength) {
            tString->sequence =
                realloc(tString->sequence,
                      newArrayLength * sizeof(Codon));
            tString->arrayLength = newArrayLength;
      }

      if (tString->sequence == NULL) {
            puts("Allocation error!");
            exit(0);
      }

      tString->length = length;
}

// Shortens reads to a fixed size (good for Solexa where errors are markedly towards the end)
void trimTightStringArray(TightString ** array, IDnum sequenceCount,
                    Coordinate length)
{
      IDnum index;

      for (index = 0; index < sequenceCount; index++)
            trimTightString(array[index], length);
}

void trimTightStringArraySanger(TightString ** array, IDnum sequenceCount,
                        Coordinate min, Coordinate max)
{
      IDnum index;

      for (index = 0; index < sequenceCount; index++) {
            if (getLength(array[index]) > max)
                  trimTightString(array[index], max);
            else if (getLength(array[index]) < min)
                  trimTightString(array[index], 0);
      }
}

void clipTightString(TightString * tString, Coordinate start,
                 Coordinate finish)
{
      Coordinate position;
      Coordinate newLength = finish - start;

      for (position = 0; position < newLength; position++)
            writeNucleotideAtPosition(getNucleotide
                                (position + start, tString),
                                position, tString);

      trimTightString(tString, newLength);
}

Nucleotide getNucleotideFromString(Coordinate nucleotideIndex,
                           char *string)
{
      char letter = string[nucleotideIndex];

      switch (letter) {
      case 'A':
            return Adenine;
      case 'C':
            return Cytosine;
      case 'G':
            return Guanine;
      case 'T':
            return Thymine;
      default:
            return Adenine;
      }
}

void exportTightString(FILE * outfile, TightString * sequence, IDnum index)
{
      Coordinate start, finish;
      char str[100];

      if (sequence == NULL)
            return;

      fprintf(outfile, ">SEQUENCE_%li_length_%li\n", index,
            getLength(sequence));

      start = 0;
      while (start <= getLength(sequence)) {
            finish = start + 60;
            readTightStringFragment(sequence, start, finish, str);
            fprintf(outfile, "%s\n", str);
            start = finish;
      }

      fflush(outfile);
}

void exportSequenceArray(char *filename, TightString ** array,
                   IDnum sequenceCount)
{
      IDnum index;
      FILE *outfile = fopen(filename, "w+");

      if (outfile == NULL) {
            puts("Couldn't open file, sorry");
            return;
      } else
            printf("Writing into file: %s\n", filename);

      for (index = 0; index < sequenceCount; index++) {
            exportTightString(outfile, array[index], index);
      }

      fclose(outfile);

      puts("Done");
}

Generated by  Doxygen 1.6.0   Back to index