perl bioinformatics getopt

Running perl script and getting "Error in option spec:"


I am very new to running perl scripts and I am running into problems that I suspect are due to the assignment of variables.

Here is the beginning of the original script:

#!/usr/bin/perl -w

#####################################
# gbstrim.pl 
# John Garbe
# June 2016
# 
#
#####################################

=head1 NAME

gbstrim.pl - 
 - Trim padding and/or cut site residue from beginning of reads
 - Trim sequencing adapter from ends of reads
 - Trim all reads to length

NOTE: this doesn't handle paired-end data

=head1 SYNOPSIS

gbstrim.pl --enzyme1 bamhi --enzyme2 psti --read R1 [--threads 1] [--minlength 20] --fastqfile file.fastq --outputfile out.fastq 

=head1 DESCRIPTION

Options:
    --fastqfile sample.fastq : fastq file of reads to process, can be gz compressed
    --outputfile sample.gbstrim.fastq : fastq file of processed reads
    --enzyme1 bamhi : First restriction enzyme used in library creation
    --enzyme2 psti : Second restriction enzyme used in library creation
    --read R1 : Specify if the provided fastq file is the R1 or R2 read
    --minlength 20 : discard reads shorter than minlength (default: 20)
    --maxlength 95 : trim all reads longer than maxlength to maxlength (default: readlength - length of longest padding sequence)
    --croplength 95 : trim down longer reads and discard shorter reads
    --removecutsite : trim off cut sites as well
    --threads 1 : Number of cpu cores cutadapt should use (default: 1)
    --verbose : Print additional details while running
    --help : Display usage information

Advanced options:
    --r1padding C,TG,AAG,GCTC : comma-separated list of 5' padding sequences. To include a pad of zero start with a comma: ,C,TG,AAG,GCTC
    --r2padding C,TG,AAG,GCTC : comma-separated list of 3' padding sequences. To include a pad of zero start with a comma: ,C,TG,AAG,GCTC
    --r1overhang AGCT : Overhang sequence left by first restriction enzyme
    --r2overhang AGCT : Overhang sequence left by second restriction enzyme
    --adapter CTGTCTCTTATACACATCTCCGAG : sequencing primer adapter to trim from 3' end of reads
    --debug : Print output useful for debugging
=cut

##################### Initialize ###############################

use Getopt::Long;
use Pod::Usage;
use FindBin;
use lib "$FindBin::RealBin";

# Command-line handling: seed defaults, parse the options into package
# globals, then check that the required inputs were supplied.
#
# set defaults (kept unless overridden by --r1adapter, --r2adapter or
# --minlength, which are bound to these same variables below)
$r1adapter = "CTGTCTCTTATACACATCTCCGAG";
$r2adapter = "CTGTCTCTTATACACATCTGACGC";
$minlength = 20;
# Each key below is an option *specification*: the option's name as typed on
# the command line, optionally followed by a type ("=s" takes a string, "=i"
# takes an integer, no suffix is a boolean flag). Each value is a reference to
# the variable that receives whatever the user passed for that option.
GetOptions("help" => \$help,
       "verbose" => \$verbose,
       "threads=i" => \$threads,
       "fastqfile=s" => \$fastqfile,
       "outputfile=s" => \$outputfile,
       "enzyme1=s" => \$r1enzyme,
       "enzyme2=s" => \$r2enzyme,
       "read=s" => \$read,
       "croplength=i" => \$croplength,
       "removecutsite" => \$removecutsite,

       # advanced options
       # NOTE(review): the POD documents --adapter, but only --r1adapter and
       # --r2adapter are defined here -- confirm which is intended.
       "r1overhang=s" => \$r1overhang,
       "r2overhang=s" => \$r2overhang,
       "r1padding=s" => \$r1padding,
       "r2padding=s" => \$r2padding,
       "debug" => \$debug,
       "minlength=i" => \$minlength,
       "maxlength=i" => \$maxlength,
       "r1adapter=s" => \$r1adapter,
       "r2adapter=s" => \$r2adapter,
    ) or pod2usage;
# GetOptions returns false when parsing fails; in that case show usage and exit.

# --help: print the detailed usage text (verbosity level 3) and exit.
pod2usage(q(-verbose) => 3) if ($help);
# GetOptions removes the options it recognizes from @ARGV, so anything still
# left is a stray argument that this script does not accept.
if ($#ARGV >= 0) {
    print "Unknown commandline parameters: @ARGV\n";
    pod2usage;
}
# Required options. These are truth tests, so an empty string or "0" also
# counts as missing.
# NOTE(review): the message lists enzyme2 as required, but $r2enzyme is not
# tested here -- confirm whether the check or the message is wrong.
if (! ($fastqfile and $outputfile and $read and $r1enzyme)) {
    print "--fastqfile, --outputfile, --read, --enzyme1 and enzyme2 are required\n";
    pod2usage;
}
# Stop early if the input path does not exist.
die "Cannot find fastq file $fastqfile\n" if (! -e $fastqfile);

This is the script as I attempted to run it, after editing it to specify my input files and enzymes. The input file I am working with is "test.fastq.gz".


#!/usr/bin/perl -w

#####################################
# gbstrim.pl 
# John Garbe
# June 2016
# 
#
#####################################

=head1 NAME

gbstrim.pl - 
 - Trim padding and/or cut site residue from beginning of reads
 - Trim sequencing adapter from ends of reads
 - Trim all reads to length

NOTE: this doesn't handle paired-end data

=head1 SYNOPSIS

gbstrim.pl --enzyme1 bamhi --enzyme2 nsii --read R1 [--threads 1] [--minlength 20] --fastqfile test.fastq.gz --outputfile trim_test.fastq.gz

=head1 DESCRIPTION

Options:
    --fastqfile test.fastq.gz : fastq file of reads to process, can be gz compressed
    --outputfile outputfile trim_test.fastq.gz : fastq file of processed reads
    --enzyme1 bamhi : First restriction enzyme used in library creation
    --enzyme2 nsii : Second restriction enzyme used in library creation
    --read R1 : Specify if the provided fastq file is the R1 or R2 read
    --minlength 20 : discard reads shorter than minlength (default: 20)
    --maxlength 95 : trim all reads longer than maxlength to maxlength (default: readlength - length of longest padding sequence)
    --croplength 95 : trim down longer reads and discard shorter reads
    --removecutsite : trim off cut sites as well
    --threads 1 : Number of cpu cores cutadapt should use (default: 1)
    --verbose : Print additional details while running
    --help : Display usage information

Advanced options:
    --r1padding C,TG,AAG,GCTC : comma-separated list of 5' padding sequences. To include a pad of zero start with a comma: ,C,TG,AAG,GCTC
    --r2padding C,TG,AAG,GCTC : comma-separated list of 3' padding sequences. To include a pad of zero start with a comma: ,C,TG,AAG,GCTC
    --r1overhang AGCT : Overhang sequence left by first restriction enzyme
    --r2overhang AGCT : Overhang sequence left by second restriction enzyme
    --adapter CTGTCTCTTATACACATCTCCGAG : sequencing primer adapter to trim from 3' end of reads
    --debug : Print output useful for debugging
=cut

##################### Initialize ###############################

use Getopt::Long;
use Pod::Usage;
use FindBin;
use lib "$FindBin::RealBin";

# Command-line handling: seed defaults, parse the options into package
# globals, then check that the required inputs were supplied.
#
# set defaults (kept unless overridden by --r1adapter, --r2adapter or
# --minlength, which are bound to these same variables below)
$r1adapter = "CTGTCTCTTATACACATCTCCGAG";
$r2adapter = "CTGTCTCTTATACACATCTGACGC";
$minlength = 20;
# Each key below is an option *specification*: the option's name as typed on
# the command line, optionally followed by a type ("=s" takes a string, "=i"
# takes an integer, no suffix is a boolean flag). Each value is a reference to
# the variable that receives whatever the user passed for that option.
GetOptions("help" => \$help,
       "verbose" => \$verbose,
       "threads=i" => \$threads,
       # NOTE(review): the next five keys should be option NAMES (the POD
       # lists them as fastqfile, outputfile, enzyme1, enzyme2 and read), but
       # values have been written in their place. A name containing "." is
       # not a valid option spec, which is consistent with an "Error in option
       # spec" failure for the first two. The other three are accepted, but
       # they define options called --bamhi, --nsii and --R1 rather than
       # supplying any value.
       "test.fastq.gz=s" => \$fastqfile,
       "trim_test.fastq.gz=s" => \$outputfile,
       "bamhi=s" => \$r1enzyme,
       "nsii=s" => \$r2enzyme,
       "R1" => \$read,
       "croplength=i" => \$croplength,
       "removecutsite" => \$removecutsite,

       # advanced options
       "r1overhang=s" => \$r1overhang,
       "r2overhang=s" => \$r2overhang,
       "r1padding=s" => \$r1padding,
       "r2padding=s" => \$r2padding,
       "debug" => \$debug,
       "minlength=i" => \$minlength,
       "maxlength=i" => \$maxlength,
       "r1adapter=s" => \$r1adapter,
       "r2adapter=s" => \$r2adapter,
    ) or pod2usage;
# GetOptions returns false when parsing fails; in that case show usage and exit.

# --help: print the detailed usage text (verbosity level 3) and exit.
pod2usage(q(-verbose) => 3) if ($help);
# GetOptions removes the options it recognizes from @ARGV, so anything still
# left is a stray argument that this script does not accept.
if ($#ARGV >= 0) {
    print "Unknown commandline parameters: @ARGV\n";
    pod2usage;
}
# Required options. These are truth tests, so an empty string or "0" also
# counts as missing.
# NOTE(review): the message lists enzyme2 as required, but $r2enzyme is not
# tested here -- confirm whether the check or the message is wrong.
if (! ($fastqfile and $outputfile and $read and $r1enzyme)) {
    print "--fastqfile, --outputfile, --read, --enzyme1 and enzyme2 are required\n";
    pod2usage;
}
# Stop early if the input path does not exist.
die "Cannot find fastq file $fastqfile\n" if (! -e $fastqfile);

This is the error message I received:

Error in option spec: "test.fastq.gz=s"

Error in option spec: "trim_test.fastq.gz=s"


Solution

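  • You have to run the original file, without your changes. The strings inside GetOptions(...) are the names of the options, not their values, so replacing them with file names such as "test.fastq.gz" produces invalid option specifications, which is exactly what the error reports. Instead, supply the values on the command line when you run the script, much like you do with any UNIX command, for example: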

    gbstrim.pl --enzyme1 bamhi --enzyme2 nsii --read R1 --fastqfile test.fastq.gz --outputfile trim_test.fastq.gz
    

    Do not change the options inside the script.