
Perl tutorial Working with DNA Sequences #!/usr/bin/perl -w # Storing DNA in a variable, and printing it out # First we store the DNA in a variable called $DNA $DNA = 'ACGGGAGGACGGGAAAATTACTACGGCATTAGC'; # Next, we print the DNA onto the screen print $DNA; # Finally, we'll specifically tell the program to exit. exit; Concatenating the DNA sequences #!/usr/bin/perl -w # Concatenating DNA # Store two DNA fragments into variables called $DNA1 # and $DNA2 $DNA1 = 'ACGGGAGGACGGGAAAATTACTACGGCATTAGC'; $DNA2 = 'ATAGTGCCGTGAGAGTGATGTAGTA'; # Print the DNA onto the screen print "Here are the original two DNA fragments:\n\n"; print $DNA1, "\n"; print $DNA2, "\n\n"; # Concatenate the DNA fragments into a third variable and # print them using "string interpolation" $DNA3 = "$DNA1$DNA2"; print "Here is the new DNA of the two fragments (version 1):\n\n"; print "$DNA3\n\n"; # An alternative way using the "dot operator": # Concatenate the DNA fragments into a third variable and # print them $DNA3 = $DNA1 . $DNA2; print "Here is the concatenation of the first two fragments (version 2):\n\n"; print "$DNA3\n\n"; # Print the same thing without using the variable $DNA3 print "Here is the concatenation of the first two fragments (version 3):\n\n"; print $DNA1, $DNA2, "\n"; exit; TRANSCRIPTION: DNA -> RNA #!/usr/bin/perl -w # Transcribing DNA into RNA # The DNA $DNA = 'ACGGGAGGACGGGAAAATTACTACGGCATTAGC'; # Print the DNA onto the screen print "Here is the starting DNA:\n\n"; print "$DNA\n\n"; # Transcribe the DNA to RNA by substituting all T's with U's. $RNA = $DNA; $RNA =~ s/T/U/g; # Print the RNA onto the screen print "Here is the result of transcribing the DNA to RNA:\n\n"; print "$RNA\n"; # Exit the program. 
exit; Reverse Complement #!/usr/bin/perl -w # Calculating the reverse complement of a strand of DNA # The DNA $DNA = 'ACGGGAGGACGGGAAAATTACTACGGCATTAGC'; # Print the DNA onto the screen print "Here is the starting DNA:\n\n"; print "$DNA\n\n"; # Calculate the reverse complement # First, copy the DNA into new variable $revcom # (short for REVerse COMplement) # # It doesn't matter if we first reverse the string and then # do the complementation; or if we first do the complementation # and then reverse the string. Same result each time. # So when we make the copy we'll do the reverse in the same statement. $revcom = reverse $DNA; # ----- The DNA is now reversed. We need to complement the bases in $revcom - substitute all bases by their complements. # A->T, T->A, G->C, C->G ####Attempt 1: $revcom =~ s/A/T/g; $revcom =~ s/T/A/g; $revcom =~ s/G/C/g; $revcom =~ s/C/G/g; # Print the reverse complement DNA onto the screen print "Here is the reverse complement DNA:\n\n"; print "$revcom\n"; ################# Does this work?? Why? # Attempt 2: start again from a fresh reversed copy, because Attempt 1 overwrote $revcom. $revcom = reverse $DNA; # See the text for a discussion of tr/// $revcom =~ tr/ACGTacgt/TGCAtgca/; # Print the reverse complement DNA onto the screen print "Here is the reverse complement DNA:\n\n"; print "$revcom\n"; print "\nThis time it worked!\n\n"; exit; Reading Proteins in Files #!/usr/bin/perl -w # Reading protein sequence data from a file # The filename of the file containing the protein sequence data $proteinfilename = 'Name_Of_your_sequence_file.txt'; # First we have to "open" the file, and associate # a "filehandle" with it. We choose the filehandle # PROTEINFILE for readability. open(PROTEINFILE, $proteinfilename) || die ("cannot open file"); # Now we do the actual reading of the protein sequence data from the file, by using the angle brackets < and > to get the input from the filehandle. We store the data into our array variable @protein. @protein = <PROTEINFILE>; # Now that we've got our data, we can close the file. 
close PROTEINFILE; # Print the protein onto the screen print "Here is the protein:\n\n"; print @protein; exit; Pattern matching: Motifs and Loops Proceed ONLY if the condition is true. Code layout: if (condition) { do something } Finding Motifs #!/usr/bin/perl -w # if-elsif-else $word = 'MNIDDKL'; # if-elsif-else conditionals if($word eq 'QSTVSGE') { print "QSTVSGE\n"; } elsif($word eq 'MRQQDMISHDEL') { print "MRQQDMISHDEL\n"; } GC CONTENT In PCR experiments, the GC-content of primers is used to predict their annealing temperature to the template DNA. A higher GC-content level indicates a higher melting temperature. \( \mathrm{GC\,\%} = \frac{G + C}{A + G + C + T} \times 100 \) Logic: for each base in the DNA if base is A count_of_A = count_of_A + 1 if base is C count_of_C = count_of_C + 1 if base is G count_of_G = count_of_G + 1 if base is T count_of_T = count_of_T + 1 done print count_of_A, count_of_C, count_of_G, count_of_T The script: #!/usr/bin/perl -w # Determining frequency of nucleotides # Get the name of the file with the DNA sequence data $dna_filename = 'File_name.txt'; # Remove the newline from the DNA filename chomp $dna_filename; # open the file, or exit open(DNAFILE, $dna_filename) || die ("Cannot open file \"$dna_filename\""); # Read the DNA sequence data from the file, and store it # into the array variable @DNA @DNA = <DNAFILE>; # Close the file close DNAFILE; # From the lines of the DNA file, # put the DNA sequence data into a single string. $DNA = join( '', @DNA); # Remove whitespace $DNA =~ s/\s//g; # Now explode the DNA into an array where each letter of # the original string is now an element in the array. # This will make it easy to look at each position. # Notice that we're reusing the variable @DNA for this purpose. @DNA = split( '', $DNA ); # Initialize the counts. # Notice that we can use scalar variables to hold numbers. 
$count_of_A = 0; $count_of_C = 0; $count_of_G = 0; $count_of_T = 0; $errors = 0; # In a loop, look at each base in turn, determine which of # the four types of nucleotides it is, and increment the # appropriate count. foreach $base (@DNA) { if ( $base eq 'A' ) { ++$count_of_A; } elsif ( $base eq 'C' ) { ++$count_of_C; } elsif ( $base eq 'G' ) { ++$count_of_G; } elsif ( $base eq 'T' ) { ++$count_of_T; } else { print "!!!!!!!! Error - I don\'t recognize this base: $base\n"; ++$errors; } } # print the results print "A = $count_of_A\n"; print "C = $count_of_C\n"; print "G = $count_of_G\n"; print "T = $count_of_T\n"; print "errors = $errors\n"; # exit the program exit; ---using regex --- while($DNA =~ /a/ig){$a++} while($DNA =~ /c/ig){$c++} while($DNA =~ /g/ig){$g++} while($DNA =~ /t/ig){$t++} while($DNA =~ /[^acgt]/ig){$e++} print "A=$a C=$c G=$g T=$t errors=$e\n"; ---- Next is a new kind of loop, the foreach loop. This loop works over the elements of an array. The line: foreach $base (@DNA) sets $base to each element of @DNA in turn and runs the loop body once per element. Writing to Files # Also write the results to a file called "countbase" $outputfile = "countbase"; open(COUNTBASE, ">$outputfile") || die ("Cannot open file \"$outputfile\" to write to!!\n\n"); print COUNTBASE "A=$a C=$c G=$g T=$t errors=$e\n"; close(COUNTBASE);
Details
-
File Typepdf
-
Upload Time-
-
Content LanguagesEnglish
-
Upload UserAnonymous/Not logged-in
-
File Pages10 Page
-
File Size-