.\" Automatically generated by Pod::Man 4.11 (Pod::Simple 3.35) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is >0, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{\ . if \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{\ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "Genome::Model::Tools::Music::Smg 3pm" .TH Genome::Model::Tools::Music::Smg 3pm "2020-11-06" "perl v5.30.3" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .IP "\-\-bmr\-modifier\-file" 4 .IX Item "--bmr-modifier-file" .RS 4 .PD 0 .IP "The user can provide a \s-1BMR\s0 modifier for each gene in the \s-1ROI\s0 file, which is a multiplier for the categorized background mutation rates, before testing them against the gene's categorized mutation rates. Such a file can be used to correct for regional or systematic bias in mutation rates across the genome that may be correlated to CpG deamination or \s-1DNA\s0 repair processes like transcription-coupled repair or mismatch repair. Mutation rates have also been associated with \s-1DNA\s0 replication timing, where higher mutation rates are seen in late replicating regions. Note that the same per-gene multiplier is used on each mutation category of \s-1BMR.\s0 Any genes from the \s-1ROI\s0 file that are not in the \s-1BMR\s0 modifier file will be tested against unmodified overall BMRs per mutation category. \s-1BMR\s0 modifiers of <=0 are not permitted, because that's just silly." 8 .IX Item "The user can provide a BMR modifier for each gene in the ROI file, which is a multiplier for the categorized background mutation rates, before testing them against the gene's categorized mutation rates. Such a file can be used to correct for regional or systematic bias in mutation rates across the genome that may be correlated to CpG deamination or DNA repair processes like transcription-coupled repair or mismatch repair. Mutation rates have also been associated with DNA replication timing, where higher mutation rates are seen in late replicating regions. Note that the same per-gene multiplier is used on each mutation category of BMR. Any genes from the ROI file that are not in the BMR modifier file will be tested against unmodified overall BMRs per mutation category. BMR modifiers of <=0 are not permitted, because that's just silly." .RE .RS 4 .RE .IP "\-\-skip\-low\-mr\-genes" 4 .IX Item "--skip-low-mr-genes" .RS 4 .IP "Genes with consistently lower MRs than the BMRs across mutation categories, may show up in the results as an \s-1SMG\s0 (by \s-1CT\s0 or \s-1LRT\s0). If such genes are not of interest, they may be assigned a p\-value of 1. This should also speed things up. Genes with higher Indel or Truncation rates than the background will not be skipped even if the gene's overall \s-1MR\s0 is lower than the \s-1BMR.\s0 If bmr-modifiers are applied, this step uses the modified BMRs instead." 8 .IX Item "Genes with consistently lower MRs than the BMRs across mutation categories, may show up in the results as an SMG (by CT or LRT). If such genes are not of interest, they may be assigned a p-value of 1. This should also speed things up. Genes with higher Indel or Truncation rates than the background will not be skipped even if the gene's overall MR is lower than the BMR. If bmr-modifiers are applied, this step uses the modified BMRs instead." .RE .RS 4 .RE .PD .PP \&\s-1EOS\s0 ); } .PP sub _doc_authors { return <<\s-1EOS\s0 Qunyuan Zhang, Ph.D. Cyriac Kandoth, Ph.D. Nathan D. Dees, Ph.D. \&\s-1EOS\s0 } .PP sub execute { my \f(CW$self\fR = shift; my \f(CW$gene_mr_file\fR = \f(CW$self\fR\->gene_mr_file; my \f(CW$output_file\fR = \f(CW$self\fR\->output_file; my \f(CW$output_file_detailed\fR = \f(CW$output_file\fR . \*(L"_detailed\*(R"; my \f(CW$max_fdr\fR = \f(CW$self\fR\->max_fdr; my \f(CW$skip_low_mr_genes\fR = \f(CW$self\fR\->skip_low_mr_genes; my \f(CW$bmr_modifier_file\fR = \f(CW$self\fR\->bmr_modifier_file; my \f(CW$processors\fR = \f(CW$self\fR\->processors; .PP .Vb 4 \& # Check on all the input data before starting work \& print STDERR "Gene mutation rate file not found or is empty: $gene_mr_file\en" unless( \-s $gene_mr_file ); \& print STDERR "BMR modifier file not found or is empty: $bmr_modifier_file\en" unless( !defined $bmr_modifier_file || \-s $bmr_modifier_file ); \& return undef unless( \-s $gene_mr_file && ( !defined $bmr_modifier_file || \-s $bmr_modifier_file )); \& \& # If BMR modifiers were provided, then load them, and create another gene_mr_file with modified BMRs \& if( defined $bmr_modifier_file ) { \& my $inBmrModFh = IO::File\->new( $bmr_modifier_file ) or die "Couldn\*(Aqt open $bmr_modifier_file. $!\en"; \& my %bmr_modifier = (); \& while( my $line = $inBmrModFh\->getline ) { \& next if( $line =~ m/^#/ ); \& chomp( $line ); \& my ( $gene, $modifier ) = split( /\et/, $line ); \& ( $modifier > 0 ) or die "$modifier is an invalid bmr\-modifier. Please fix values in $bmr_modifier_file.\en"; \& $bmr_modifier{$gene} = $modifier; \& } \& $inBmrModFh\->close; \& \& my $new_gene_mr_file = Genome::Sys\->create_temp_file_path; \& ( $new_gene_mr_file ) or die "Couldn\*(Aqt create a temp file. $!"; \& my $inMrFh = IO::File\->new( $gene_mr_file ) or die "Couldn\*(Aqt open $gene_mr_file. $!\en"; \& my $outMrFh = IO::File\->new( $new_gene_mr_file, ">" ) or die "Couldn\*(Aqt open $new_gene_mr_file. $!\en"; \& while( my $line = $inMrFh\->getline ) { \& if( $line =~ m/^#/ ) { \& $outMrFh\->print( $line ); \& next; \& } \& chomp( $line ); \& my ( $gene, $type, $covd_bps, $mut_cnt, $bmr ) = split( /\et/, $line ); \& $bmr = $bmr * $bmr_modifier{$gene} if( defined $bmr_modifier{$gene} ); \& $outMrFh\->print( "$gene\et$type\et$covd_bps\et$mut_cnt\et$bmr\en" ); \& } \& $outMrFh\->close; \& $inMrFh\->close; \& \& $gene_mr_file = $new_gene_mr_file; \& } \& \& # Collect per\-gene mutation rates for reporting in results later \& my ( %gene_muts, %gene_bps, %mut_classes_hash ); \& my $inMrFh = IO::File\->new( $gene_mr_file ) or die "Couldn\*(Aqt open $gene_mr_file. $!\en"; \& while( my $line = $inMrFh\->getline ) { \& next if( $line =~ m/^#/ ); \& my ( $gene, $type, $covd_bps, $mut_cnt, undef ) = split( /\et/, $line ); \& \& # Warn user about cases where there could be fewer covered bps than mutations detected \& ( $mut_cnt <= $covd_bps ) or warn "More $type seen in $gene than there are bps with sufficient coverage!\en"; \& \& if( $type eq "Overall" or $type eq "Indels" or $type eq "Truncations" ) { \& $gene_muts{$gene}{$type} = $mut_cnt; \& $gene_bps{$gene} = $covd_bps; \& $mut_classes_hash{$type} = 1 unless( $type eq "Overall" ); \& } \& elsif( $type =~ m/(Transitions|Transversions)$/ ) { \& $gene_muts{$gene}{SNVs} += $mut_cnt; \& $mut_classes_hash{SNVs} = 1; \& } \& else { \& die "Unrecognized mutation class in gene\-mr\-file. $!\en"; \& } \& } \& $inMrFh\->close; \& my @mut_classes = sort keys %mut_classes_hash; \& \& # Create a temporary intermediate file to hold the p\-values \& my $pval_file = Genome::Sys\->create_temp_file_path; \& ( $pval_file ) or die "Couldn\*(Aqt create a temp file. $!"; \& \& # Call R for Fisher combined test, Likelihood ratio test, and convolution test on each gene \& my $smg_cmd = "R \-\-slave \-\-args < " . _\|_FILE_\|_ . ".R $gene_mr_file $pval_file smg_test $processors $skip_low_mr_genes"; \& WIFEXITED( system $smg_cmd ) or croak "Couldn\*(Aqt run: $smg_cmd ($?)"; \& \& # Call R for calculating FDR on the p\-values calculated in the SMG test \& my $fdr_cmd = "R \-\-slave \-\-args < " . _\|_FILE_\|_ . ".R $pval_file $output_file_detailed calc_fdr $processors $skip_low_mr_genes"; \& WIFEXITED( system $fdr_cmd ) or croak "Couldn\*(Aqt run: $fdr_cmd ($?)"; \& \& # Parse the R output to identify the SMGs (significant by at least 2 of 3 tests) \& my $smgFh = IO::File\->new( $output_file_detailed ) or die "Couldn\*(Aqt open $output_file_detailed. $!\en"; \& my ( @newLines, @smgLines ); \& my $header = "#Gene\et" . join( "\et", @mut_classes ); \& $header .= "\etTot Muts\etCovd Bps\etMuts pMbp\etP\-value FCPT\etP\-value LRT\etP\-value CT\etFDR FCPT\etFDR LRT\etFDR CT\en"; \& while( my $line = $smgFh\->getline ) { \& chomp( $line ); \& if( $line =~ m/^Gene\etp.fisher\etp.lr\etp.convol\etfdr.fisher\etfdr.lr\etfdr.convol$/ ) { \& push( @newLines, $header ); \& push( @smgLines, $header ); \& } \& else { \& my ( $gene, @pq_vals ) = split( /\et/, $line ); \& my ( $p_fcpt, $p_lrt, $p_ct, $q_fcpt, $q_lrt, $q_ct ) = @pq_vals; \& my @mut_cnts; \& foreach( @mut_classes ) { \& # If a mutation count is a fraction, round down the digits after the decimal point \& push( @mut_cnts, (( $gene_muts{$gene}{$_} =~ m/\e./ ) ? sprintf( "%.2f", $gene_muts{$gene}{$_} ) : $gene_muts{$gene}{$_} )); \& } \& my $mut_per_mbp = ( $gene_bps{$gene} ? sprintf( "%.2f", ( $gene_muts{$gene}{Overall} / $gene_bps{$gene} * 1000000 )) : 0 ); \& push( @newLines, join( "\et", $gene, @mut_cnts, $gene_muts{$gene}{Overall}, $gene_bps{$gene}, $mut_per_mbp, @pq_vals ) . "\en" ); \& \& # If the FDR of at least two of these tests is less than the maximum allowed, we consider it an SMG \& if(( $q_fcpt <= $max_fdr && $q_lrt <= $max_fdr ) || ( $q_fcpt <= $max_fdr && $q_ct <= $max_fdr ) || \& ( $q_lrt <= $max_fdr && $q_ct <= $max_fdr )) { \& push( @smgLines, join( "\et", $gene, @mut_cnts, $gene_muts{$gene}{Overall}, $gene_bps{$gene}, $mut_per_mbp, @pq_vals ) . "\en" ); \& } \& } \& } \& $smgFh\->close; \& \& # Add per\-gene SNV and Indel counts to the detailed R output, and make the header friendlier \& my $outDetFh = IO::File\->new( $output_file_detailed, ">" ) or die "Couldn\*(Aqt open $output_file_detailed. $!\en"; \& $outDetFh\->print( @newLines ); \& $outDetFh\->close; \& \& # Do the same for only the genes that we consider SMGs \& my $outFh = IO::File\->new( $output_file, ">" ) or die "Couldn\*(Aqt open $output_file. $!\en"; \& $outFh\->print( @smgLines ); \& $outFh\->close; \& \& return 1; \&} .Ve .PP 1;