# -*- coding: utf-8 -*-
"""
ARSA Validation Script

This script validates amino acid substitution submission files for the
CAGI7 ARSA challenge.

Usage:
    python3 arsavalidation.py <submission_file> <variant_list_file>

Example:
    python3 arsavalidation.py arsasubmission.tsv arsavariantlist.txt

Arguments:
    submission_file   - Path to your submission file
    variant_list_file - Path to the variant list file (arsavariantlist.txt)
"""

import math
import os
import re
import sys

# Prefix pattern for a substitution such as "R21S". It is applied with
# ``match`` (not ``fullmatch``) on purpose: membership in the challenge's
# variant list is the authoritative check, this only catches gross typos.
variant_regex = re.compile(r'([A-Z])([0-9]+)([A-Z])')

# Header shown to the user in warnings, and the prefix that is actually
# required (the trailing "comment" column is optional).
EXPECTED_HEADER = "aa_substitution\tstability_score_48hr\tsd\tcomment"
REQUIRED_HEADER_PREFIX = "aa_substitution\tstability_score_48hr\tsd"


#%% Load variant list from file
def load_variant_list(filename):
    """Load the set of challenge variants from a variant list file.

    Each non-empty line, stripped of surrounding whitespace, is one variant.

    Args:
        filename: Path to the variant list file (one variant per line).

    Returns:
        A set of variant strings.

    Exits the process with status 1 (after printing a message) when the
    file does not exist or cannot be read.
    """
    if not os.path.exists(filename):
        print(f"Error: Variant list {filename} not found.")
        sys.exit(1)

    try:
        with open(filename, 'r') as f:
            stripped = (line.strip() for line in f)
            return {variant for variant in stripped if variant}  # skip empty lines
    except (OSError, UnicodeError) as e:
        print(f"Error reading {filename}: {e}")
        sys.exit(1)


#%% Validation helpers
def _check_line(line, va_list, submitted_variants, duplicate_variants):
    """Return the reason ``line`` is invalid, or None when nothing is reported.

    ``submitted_variants`` and ``duplicate_variants`` are updated in place so
    that duplicates can be detected across lines.
    """
    arr = line.split('\t')
    if len(arr) < 3:
        return "Each line must have at least 3 columns: aa_substitution, stability_score_48hr, sd"

    # Anything after the third column is a free-form comment and is ignored.
    va, score, std = arr[:3]

    if not variant_regex.match(va):
        return f'Variant "{va}" does not match expected format (e.g., R21S)'

    if va not in va_list:
        return f'Variant {va} does not appear in this challenge. Please check whether this is the correct file'

    if va in submitted_variants:
        if va in duplicate_variants:
            return None  # this duplicate has already been reported once
        duplicate_variants.add(va)
        return f'Variant {va} appears multiple times in submission'

    # Recorded before the numeric checks: a variant with a bad score is
    # reported as a format error, not additionally as "missing".
    submitted_variants.add(va)

    try:
        score_val = float(score)
    except ValueError:
        return f'Stability score value "{score}" is invalid float format'
    if not (0 <= score_val <= 1):  # also rejects NaN
        return (f'Stability score value "{score_val}" is invalid. '
                'The value should be between 0 and 1 (where 0.4 represents 40% protein remaining)')

    try:
        std_val = float(std)
    except ValueError:
        return f'Standard deviation value "{std}" is invalid float format'
    if not math.isfinite(std_val):
        # float() accepts "inf"/"nan"; neither is a usable standard deviation.
        return f'Standard deviation value "{std_val}" is invalid. The value should be a finite number'
    if std_val < 0:
        return f'Standard deviation value "{std_val}" is invalid. The value should be non-negative'

    return None


def _validate_submission(lines, va_list):
    """Validate an iterable of submission lines (header first).

    Returns:
        ``(warnings, errors, submitted_variants)`` where ``warnings`` is a
        list of strings, ``errors`` is a list of ``(location, reason)``
        pairs, and ``submitted_variants`` is the set of distinct challenge
        variants seen.
    """
    lines = iter(lines)
    warnings = []
    errors = []
    submitted_variants = set()
    duplicate_variants = set()

    header = next(lines, "").strip()
    if not header.startswith(REQUIRED_HEADER_PREFIX):
        warnings.append("Header format may be incorrect.")
        warnings.append(f"Expected: {EXPECTED_HEADER}")
        warnings.append(f"Found: {header}")

    # The header is line 1, so data lines are numbered from 2.
    for linenum, raw in enumerate(lines, start=2):
        line = raw.rstrip()
        if not line:  # skip empty lines
            continue
        reason = _check_line(line, va_list, submitted_variants, duplicate_variants)
        if reason is not None:
            errors.append((f'line:{linenum} is invalid,\n{line}', reason))

    return warnings, errors, submitted_variants


#%% Entry point
def main(argv=None):
    """Validate a submission file against a variant list and print a report.

    Args:
        argv: Argument vector in ``sys.argv`` layout
            ``[prog, submission_file, variant_list_file]``. Defaults to
            ``sys.argv``.

    Returns None when the submission is valid and complete. Exits the
    process with status 1 on bad usage, unreadable files, format errors,
    or missing variants.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 3:
        print("Usage: python arsavalidation.py <submission_file> <variant_list_file>")
        print("Example: python arsavalidation.py arsasubmission.tsv /h/Desktop/arsavariantlist.txt")
        sys.exit(1)

    submission_file, variant_list_file = argv[1], argv[2]

    va_list = load_variant_list(variant_list_file)
    print(f"Loaded {len(va_list)} variants from {variant_list_file}")

    if not os.path.exists(submission_file):
        print(f"Error: Submission file '{submission_file}' not found.")
        sys.exit(1)

    try:
        with open(submission_file, 'r') as file:
            warnings, errors, submitted_variants = _validate_submission(file, va_list)
    except (OSError, UnicodeError) as e:
        print(f"Error reading submission file: {e}")
        sys.exit(1)

    # Print all warnings first
    if warnings:
        print("\n=== WARNINGS ===")
        for warning in warnings:
            print(warning)

    # Print all errors
    if errors:
        print("\n=== ERRORS FOUND ===")
        for location, reason in errors:
            print(location)
            print(reason)
        print(f"\nTotal errors found: {len(errors)}")

    # Check if all variants are submitted
    missing_variants = va_list - submitted_variants
    if missing_variants:
        print("\n=== MISSING VARIANTS ===")
        print(f"{len(missing_variants)} variants are missing from your submission:")
        for variant in sorted(missing_variants):
            print(f" {variant}")

    print('\n=== VALIDATION SUMMARY ===')
    print(f'- Submitted variants: {len(submitted_variants)}')
    print(f'- Expected variants: {len(va_list)}')
    print(f'- Missing variants: {len(missing_variants)}')
    print(f'- Format errors: {len(errors)}')

    if not missing_variants and not errors:
        print('\nThe file\'s format is valid and complete! You are good to submit now!')
    else:
        print('\nPlease fix all errors and add predictions for missing variants before submission.')
        sys.exit(1)


if __name__ == "__main__":
    main()