#!/usr/bin/perl -T
##################################################################
# Validation format for PCM1 submission                          #
# Number of rows: 38 + 1 header row                              #
# Number of columns: 9                                           #
##################################################################
#
# Reads the submission file named on the command line, checks its
# shape (row count, exact header, 9 tab-separated columns per row)
# and the numeric columns, then either prints the list of problems
# and exits non-zero, or prints a success message followed by an MD5
# hash of the file (and a receipt code when the local salt file is
# readable).

use strict;
use warnings;

# Because "-T" is on the #! line, perl refuses to start unless -T is
# also given on the command line, hence "perl -T" in the usage text.
my $usage = "
# This script is used for PCM1 challenge of CAGI-5.
# Please run this script before your submission to ensure the data format of submitted file is correct!
# Data format
# tab-seperated 9 columns file:
# 1. Nucleotide position: DNA coding change of the variant (e.g., c.G17A) on the NM_001315507 transcript.
# 2. Variant: protein change of the variant (e.g. p.G6D), on UniProtKB protein - Q15154 (PCM1_HUMAN)
# 3. p-value relative of change from MO: The probability that this variant is statistically different from MO.
# 4. Standard deviation: This defines the confidence of the prediction in column 3. Large SD means low confidence, while small SD means that the predictor is confident about the submitted prediction.
# 5.p-value of change from MO+WT: The probability that this variant is statistically different from MO+WT
# 6. Standard deviation: This defines the confidence of the prediction in column 5. Large SD means low confidence, while small SD means that the predictor is confident about the submitted prediction.
# 7. Functional effect: pathogenic (2), hypomorphic (1), or benign (0)
# 8. Confidence: in the functional effect assignment ranges 0.0 to 1.0. (1.0 implies total confidence in the assignment)
# 9. Comments: Optional brief comments based on the predictions
# Leave a \"*\" if you have no predictions/commets.

Usage: perl -T $0 <submission_file>

Thank you for taking part in CAGI 5! Good luck!
";

die $usage if @ARGV != 1;

my $num_rows    = 39;    # including header
my $num_columns = 9;
my $header =
    "Nucleotide_position\tVariant\tP-value_MO\tStandard_deviation\tP-value_MO+WT\tStandard_deviation\tFunctional_effect\tConfidence:\tComments";

# Untaint the file name: only a conservative character set is accepted,
# because the name is later interpolated into shell command lines.
my $filename = $ARGV[0];
if ($filename =~ /\A([A-Za-z\-\_\.\/0-9]+)\z/) {
    $filename = $1;
}
else {
    die "Only alphabet, numbers, \"_\", \"-\", \".\" and \"/\" are allowed in the file name\n";
}

# Three-argument open so the name is always treated as a plain path
# (a two-argument open would read STDIN for "-").
open(my $in, '<', $filename) or die "Cannot open input file: $!\n";
my @contents = <$in>;
chomp @contents;
close($in);

my @errors = ();
push @errors, "The number of lines does not match the template (1 header + 38 data rows)!\n"
    if @contents != $num_rows;

my $h = shift @contents;
$h = '' unless defined $h;    # empty file: report a header mismatch, not a warning
push @errors, "Header should be exact \"$header\".\n" if $h ne $header;

# Columns that must hold '*' or a real number in [0, 1]:
# [ zero-based index, ordinal used in messages, label used in messages ]
my @unit_interval_columns = (
    [ 2, '3rd', 'p-value relative of change from MO' ],
    [ 3, '4th', 'SD' ],
    [ 4, '5th', 'p-value of change from MO+WT' ],
    [ 5, '6th', 'SD' ],
    [ 7, '8th', 'Confidence' ],
);

# $line counts data rows starting at 1 (the header row is not counted).
my $line = 0;
for my $row (@contents) {
    $line++;

    # split drops trailing empty fields, so an empty last column shows up
    # as "too few columns" -- the message asks for '*' in that case.
    my @cols = split /\t/, $row;

    if (@cols < $num_columns) {
        push @errors,
            "The number of columns at line $line should be exact $num_columns. Please indicate the empty columns with '*'.\n";
        next;    # the field checks below would only see undefined values
    }
    push @errors,
        "The number of columns at line $line should be exact $num_columns. Tabs can only be used as column separators.\n"
        if @cols > $num_columns;

    # Columns 1 and 2 (nucleotide position, variant) are not validated here.
    # NOTE(review): column 7 (Functional effect) is not validated either; the
    # usage text describes it as 0, 1 or 2 -- confirm whether a check is wanted.

    # Each column is tested against its own value.
    for my $spec (@unit_interval_columns) {
        my ($index, $ordinal, $label) = @{$spec};
        push @errors, check_unit_interval($cols[$index], $ordinal, $label, $line);
    }

    # Comments may be any text, but a comment made up solely of non-word
    # characters is rejected; '*' is the marker for "no comment".
    my $comment = $cols[8];
    if ($comment ne '*' && $comment =~ /^\W+$/) {
        push @errors,
            "The 9th column (Comments) at line $line should be any text; leave a '*' to indicate empty.\n";
    }
}

if (@errors > 0) {
    print "Validation failed with errors:\n";
    print @errors;
    exit -1;
}

print "Congratulations! Your submitted file has a validated format.\n";
if ((-x "/usr/bin/md5sum") && (-x "/usr/bin/cut")) {
    print "MD5 hash for your file is ";

    # Taint mode refuses to run a shell with an unsafe environment.
    $ENV{PATH} = "";
    delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};

    # "--" stops a file name starting with "-" from being read as an option.
    system("/usr/bin/md5sum -- $filename | /usr/bin/cut -c 1-32");

    if ((-r "/data/ajitha/checks/cagi.txt") && (-x "/bin/cat")) {
        print "Receipt code for your file is ";
        system("/bin/cat -- /data/ajitha/checks/cagi.txt $filename | /usr/bin/md5sum | /usr/bin/cut -c 1-32");
    }
}
exit 0;

# Check one field that must be '*' or a plain decimal number in [0, 1].
#   $value   - the raw field text
#   $ordinal - column ordinal for the message (e.g. '3rd')
#   $label   - column description for the message
#   $line    - data-row number for the message
# Returns an empty list when the field is acceptable, otherwise a single
# error string, so the result can be pushed straight onto the error list.
sub check_unit_interval {
    my ($value, $ordinal, $label, $line) = @_;

    return if $value eq '*';

    if ($value =~ /^[+-]?\d+\.?\d*$/) {
        return if $value >= 0 && $value <= 1;
        return
            "The $ordinal column ($label) at line $line should be a real number between 0 and 1; leave a '*' to indicate empty number.\n";
    }

    return
        "The $ordinal column ($label) at line $line should be a real number; leave a '*' to indicate empty number.\n";
}