.\" Automatically generated by Pod::Man 4.14 (Pod::Simple 3.43) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is >0, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{\ . if \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{\ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" ======================================================================== .\" .IX Title "MCE::Grep 3pm" .TH MCE::Grep 3pm "2023-09-29" "perl v5.36.0" "User Contributed Perl Documentation" .\" For nroff, turn off justification. 
Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" MCE::Grep \- Parallel grep model similar to the native grep function .SH "VERSION" .IX Header "VERSION" This document describes MCE::Grep version 1.889 .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 2 \& ## Exports mce_grep, mce_grep_f, and mce_grep_s \& use MCE::Grep; \& \& ## Array or array_ref \& my @a = mce_grep { $_ % 5 == 0 } 1..10000; \& my @b = mce_grep { $_ % 5 == 0 } \e@list; \& \& ## Important; pass an array_ref for deeply nested input data \& my @c = mce_grep { $_\->[1] % 2 == 0 } [ [ 0, 1 ], [ 0, 2 ], ... ]; \& my @d = mce_grep { $_\->[1] % 2 == 0 } \e@deeply_list; \& \& ## File path, glob ref, IO::All::{ File, Pipe, STDIO } obj, or scalar ref \& ## Workers read directly and do not involve the manager process \& my @e = mce_grep_f { /pattern/ } "/path/to/file"; # efficient \& \& ## Involves the manager process, therefore slower \& my @f = mce_grep_f { /pattern/ } $file_handle; \& my @g = mce_grep_f { /pattern/ } $io; \& my @h = mce_grep_f { /pattern/ } \e$scalar; \& \& ## Sequence of numbers (begin, end [, step, format]) \& my @i = mce_grep_s { $_ % 3 == 0 } 1, 10000, 5; \& my @j = mce_grep_s { $_ % 3 == 0 } [ 1, 10000, 5 ]; \& \& my @k = mce_grep_s { $_ % 3 == 0 } { \& begin => 1, end => 10000, step => 5, format => undef \& }; .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" This module provides a parallel grep implementation via Many-Core Engine. \&\s-1MCE\s0 incurs a small overhead due to passing of data. A fast code block will run faster natively. However, the overhead will likely diminish as the complexity increases for the code. .PP .Vb 2 \& my @m1 = grep { $_ % 5 == 0 } 1..1000000; ## 0.065 secs \& my @m2 = mce_grep { $_ % 5 == 0 } 1..1000000; ## 0.194 secs .Ve .PP Chunking, enabled by default, greatly reduces the overhead behind the scenes. The time for mce_grep below also includes the time for data exchanges between the manager and worker processes. 
More parallelization will be seen when the code incurs additional \s-1CPU\s0 time. .PP .Vb 2 \& my @m1 = grep { /[2357][1468][9]/ } 1..1000000; ## 0.353 secs \& my @m2 = mce_grep { /[2357][1468][9]/ } 1..1000000; ## 0.218 secs .Ve .PP Even faster is mce_grep_s; useful when input data is a range of numbers. Workers generate sequences mathematically among themselves without any interaction from the manager process. Two arguments are required for mce_grep_s (begin, end). Step defaults to 1 if begin is smaller than end, otherwise \-1. .PP .Vb 1 \& my @m3 = mce_grep_s { /[2357][1468][9]/ } 1, 1000000; ## 0.165 secs .Ve .PP Although this document is about MCE::Grep, the MCE::Stream module can write results immediately without waiting for all chunks to complete. This is made possible by passing the reference to an array (in this case \f(CW@m4\fR and \f(CW@m5\fR). .PP .Vb 1 \& use MCE::Stream default_mode => \*(Aqgrep\*(Aq; \& \& my @m4; mce_stream \e@m4, sub { /[2357][1468][9]/ }, 1..1000000; \& \& ## Completed in 0.203 secs. This is amazing considering the \& ## overhead for passing data between the manager and workers. \& \& my @m5; mce_stream_s \e@m5, sub { /[2357][1468][9]/ }, 1, 1000000; \& \& ## Completed in 0.120 secs. Like with mce_grep_s, specifying a \& ## sequence specification turns out to be faster due to lesser \& ## overhead for the manager process. .Ve .PP A common scenario is grepping for pattern(s) inside a massive log file. Notice how parallelism increases as complexity increases for the pattern. Testing was done against a 300 \s-1MB\s0 file containing 250k lines. .PP .Vb 1 \& use MCE::Grep; \& \& my @m; open my $LOG, "<", "/path/to/log/file" or die "$!\en"; \& \& @m = grep { /pattern/ } <$LOG>; ## 0.756 secs \& @m = grep { /foobar|[2357][1468][9]/ } <$LOG>; ## 24.681 secs \& \& ## Parallelism with mce_grep. This involves the manager process \& ## due to processing a file handle. 
\& \& @m = mce_grep { /pattern/ } <$LOG>; ## 0.997 secs \& @m = mce_grep { /foobar|[2357][1468][9]/ } <$LOG>; ## 7.439 secs \& \& ## Even faster with mce_grep_f. Workers access the file directly \& ## with zero interaction from the manager process. \& \& my $LOG = "/path/to/file"; \& @m = mce_grep_f { /pattern/ } $LOG; ## 0.112 secs \& @m = mce_grep_f { /foobar|[2357][1468][9]/ } $LOG; ## 6.840 secs .Ve .SH "PARSING HUGE FILES" .IX Header "PARSING HUGE FILES" The MCE::Grep module cannot quickly determine whether a chunk contains a match, because it does not know the pattern inside the code block. Use the following snippet as a template to achieve better performance. Also, take a look at examples/egrep.pl, included with the distribution. .PP .Vb 1 \& use MCE::Loop; \& \& MCE::Loop\->init( \& max_workers => 8, use_slurpio => 1 \& ); \& \& my $pattern = \*(Aqkarl\*(Aq; \& my $hugefile = \*(Aqvery_huge.file\*(Aq; \& \& my @result = mce_loop_f { \& my ($mce, $slurp_ref, $chunk_id) = @_; \& \& ## Quickly determine if a match is found. \& ## Process slurped chunk only if true. \& \& if ($$slurp_ref =~ /$pattern/m) { \& my @matches; \& \& ## The following is fast on Unix. Performance degrades \& ## drastically on Windows beyond 4 workers. \& \& open my $MEM_FH, \*(Aq<\*(Aq, $slurp_ref; \& binmode $MEM_FH, \*(Aq:raw\*(Aq; \& while (<$MEM_FH>) { push @matches, $_ if (/$pattern/); } \& close $MEM_FH; \& \& ## Therefore, use the following construct on Windows. \& \& while ( $$slurp_ref =~ /([^\en]+\en)/mg ) { \& my $line = $1; # save $1 to not lose the value \& push @matches, $line if ($line =~ /$pattern/); \& } \& \& ## Gather matched lines. \& \& MCE\->gather(@matches); \& } \& \& } $hugefile; \& \& print join(\*(Aq\*(Aq, @result); .Ve .SH "OVERRIDING DEFAULTS" .IX Header "OVERRIDING DEFAULTS" The following lists the options that may be overridden when loading the module. 
.PP .Vb 3 \& use Sereal qw( encode_sereal decode_sereal ); \& use CBOR::XS qw( encode_cbor decode_cbor ); \& use JSON::XS qw( encode_json decode_json ); \& \& use MCE::Grep \& max_workers => 4, # Default \*(Aqauto\*(Aq \& chunk_size => 100, # Default \*(Aqauto\*(Aq \& tmp_dir => "/path/to/app/tmp", # $MCE::Signal::tmp_dir \& freeze => \e&encode_sereal, # \e&Storable::freeze \& thaw => \e&decode_sereal, # \e&Storable::thaw \& init_relay => 0, # Default undef; MCE 1.882+ \& use_threads => 0, # Default undef; MCE 1.882+ \& ; .Ve .PP From \s-1MCE 1.8\s0 onwards, Sereal 3.015+ is loaded automatically if available. Specify \f(CW\*(C`Sereal => 0\*(C'\fR to use Storable instead. .PP .Vb 1 \& use MCE::Grep Sereal => 0; .Ve .SH "CUSTOMIZING MCE" .IX Header "CUSTOMIZING MCE" .IP "MCE::Grep\->init ( options )" 3 .IX Item "MCE::Grep->init ( options )" .PD 0 .IP "MCE::Grep::init { options }" 3 .IX Item "MCE::Grep::init { options }" .PD .PP The init function accepts a hash of \s-1MCE\s0 options. The gather option, if specified, is ignored due to being used internally by the module. .PP .Vb 1 \& use MCE::Grep; \& \& MCE::Grep\->init( \& chunk_size => 1, max_workers => 4, \& \& user_begin => sub { \& print "## ", MCE\->wid, " started\en"; \& }, \& \& user_end => sub { \& print "## ", MCE\->wid, " completed\en"; \& } \& ); \& \& my @a = mce_grep { $_ % 5 == 0 } 1..100; \& \& print "\en", "@a", "\en"; \& \& \-\- Output \& \& ## 2 started \& ## 3 started \& ## 1 started \& ## 4 started \& ## 3 completed \& ## 4 completed \& ## 1 completed \& ## 2 completed \& \& 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 .Ve .SH "API DOCUMENTATION" .IX Header "API DOCUMENTATION" .IP "MCE::Grep\->run ( sub { code }, list )" 3 .IX Item "MCE::Grep->run ( sub { code }, list )" .PD 0 .IP "mce_grep { code } list" 3 .IX Item "mce_grep { code } list" .PD .PP Input data may be defined using a list or an array reference. 
Unlike MCE::Loop, Flow, and Step, specifying a hash reference as input data isn't allowed. .PP .Vb 3 \& ## Array or array_ref \& my @a = mce_grep { /[2357]/ } 1..1000; \& my @b = mce_grep { /[2357]/ } \e@list; \& \& ## Important; pass an array_ref for deeply nested input data \& my @c = mce_grep { $_\->[1] =~ /[2357]/ } [ [ 0, 1 ], [ 0, 2 ], ... ]; \& my @d = mce_grep { $_\->[1] =~ /[2357]/ } \e@deeply_list; \& \& ## Not supported \& my @z = mce_grep { ... } \e%hash; .Ve .IP "MCE::Grep\->run_file ( sub { code }, file )" 3 .IX Item "MCE::Grep->run_file ( sub { code }, file )" .PD 0 .IP "mce_grep_f { code } file" 3 .IX Item "mce_grep_f { code } file" .PD .PP The fastest of these is the /path/to/file. Workers communicate the next offset position among themselves with zero interaction by the manager process. .PP \&\f(CW\*(C`IO::All\*(C'\fR { File, Pipe, \s-1STDIO\s0 } is supported since \s-1MCE 1.845.\s0 .PP .Vb 4 \& my @c = mce_grep_f { /pattern/ } "/path/to/file"; # faster \& my @d = mce_grep_f { /pattern/ } $file_handle; \& my @e = mce_grep_f { /pattern/ } $io; # IO::All \& my @f = mce_grep_f { /pattern/ } \e$scalar; .Ve .ie n .IP "MCE::Grep\->run_seq ( sub { code }, $beg, $end [, $step, $fmt ] )" 3 .el .IP "MCE::Grep\->run_seq ( sub { code }, \f(CW$beg\fR, \f(CW$end\fR [, \f(CW$step\fR, \f(CW$fmt\fR ] )" 3 .IX Item "MCE::Grep->run_seq ( sub { code }, $beg, $end [, $step, $fmt ] )" .PD 0 .ie n .IP "mce_grep_s { code } $beg, $end [, $step, $fmt ]" 3 .el .IP "mce_grep_s { code } \f(CW$beg\fR, \f(CW$end\fR [, \f(CW$step\fR, \f(CW$fmt\fR ]" 3 .IX Item "mce_grep_s { code } $beg, $end [, $step, $fmt ]" .PD .PP Sequence may be defined as a list, an array reference, or a hash reference. The functions require both begin and end values to run. Step and format are optional. The format is passed to sprintf (% may be omitted below). 
.PP .Vb 1 \& my ($beg, $end, $step, $fmt) = (10, 20, 0.1, "%4.1f"); \& \& my @f = mce_grep_s { /[1234]\e.[5678]/ } $beg, $end, $step, $fmt; \& my @g = mce_grep_s { /[1234]\e.[5678]/ } [ $beg, $end, $step, $fmt ]; \& \& my @h = mce_grep_s { /[1234]\e.[5678]/ } { \& begin => $beg, end => $end, \& step => $step, format => $fmt \& }; .Ve .IP "MCE::Grep\->run ( sub { code }, iterator )" 3 .IX Item "MCE::Grep->run ( sub { code }, iterator )" .PD 0 .IP "mce_grep { code } iterator" 3 .IX Item "mce_grep { code } iterator" .PD .PP An iterator reference may be specified for input_data. Iterators are described under section \*(L"\s-1SYNTAX\s0 for \s-1INPUT_DATA\*(R"\s0 at MCE::Core. .PP .Vb 1 \& my @a = mce_grep { $_ % 3 == 0 } make_iterator(10, 30, 2); .Ve .SH "MANUAL SHUTDOWN" .IX Header "MANUAL SHUTDOWN" .IP "MCE::Grep\->finish" 3 .IX Item "MCE::Grep->finish" .PD 0 .IP "MCE::Grep::finish" 3 .IX Item "MCE::Grep::finish" .PD .PP Workers remain persistent as much as possible after running. Shutdown occurs automatically when the script terminates. Call finish when workers are no longer needed. .PP .Vb 1 \& use MCE::Grep; \& \& MCE::Grep\->init( \& chunk_size => 20, max_workers => \*(Aqauto\*(Aq \& ); \& \& my @a = mce_grep { ... } 1..100; \& \& MCE::Grep\->finish; .Ve .SH "INDEX" .IX Header "INDEX" \&\s-1MCE\s0, MCE::Core .SH "AUTHOR" .IX Header "AUTHOR" Mario E. Roy,