.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.28) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{ . if \nF \{ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . 
ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "Text::Affixes 3pm" .TH Text::Affixes 3pm "2015-11-01" "perl v5.20.2" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" Text::Affixes \- Prefixes and suffixes analysis of text .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 3 \& use Text::Affixes; \& my $text = "Hello, world. 
Hello, big world."; \& my $prefixes = get_prefixes($text); \& \& # $prefixes now holds \& # { \& # 3 => { \& # \*(AqHel\*(Aq => 2, \& # \*(Aqwor\*(Aq => 2, \& # } \& # } \& \& # or \& \& $prefixes = get_prefixes({min => 1, max => 2},$text); \& \& # $prefixes now holds \& # { \& # 1 => { \& # \*(AqH\*(Aq => 2, \& # \*(Aqw\*(Aq => 2, \& # \*(Aqb\*(Aq => 1, \& # }, \& # 2 => { \& # \*(AqHe\*(Aq => 2, \& # \*(Aqwo\*(Aq => 2, \& # \*(Aqbi\*(Aq => 1, \& # } \& # } \& \& # the use for get_suffixes is similar .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" Provides methods for prefix and suffix analysis of text. .SH "METHODS" .IX Header "METHODS" .SS "get_prefixes" .IX Subsection "get_prefixes" Extracts prefixes from text. You can specify the minimum and maximum number of characters of prefixes you want. .PP Returns a reference to a hash in which each prefix size within the specified limits is mapped to a hash; each of those hashes maps every prefix of that size found in the text to the number of times it was found. .PP By default, both minimum and maximum limits are 3. If the minimum limit is greater than the maximum limit, an empty hash is returned. .PP A prefix is considered to be a sequence of word characters (\ew) at the beginning of a word (that is, after a word boundary) that does not reach the end of the word (\*(L"regular expressionly\*(R" speaking, a prefix is the \f(CW$1\fR of /\eb(\ew+)\ew/). .PP .Vb 2 \& # extracting prefixes of size 3 \& $prefixes = get_prefixes( $text ); \& \& # extracting prefixes of sizes 2 and 3 \& $prefixes = get_prefixes( {min => 2}, $text ); \& \& # extracting prefixes of sizes 3 and 4 \& $prefixes = get_prefixes( {max => 4}, $text ); \& \& # extracting prefixes of sizes 2, 3 and 4 \& $prefixes = get_prefixes( {min => 2, max=> 4}, $text); .Ve .SS "get_suffixes" .IX Subsection "get_suffixes" The get_suffixes function is similar to the get_prefixes one. You should read the documentation for that one and then come back to this point. 
.PP A suffix is considered to be a sequence of word characters (\ew) at the end of a word (that is, before a word boundary) that does not start at the beginning of the word (\*(L"regular expressionly\*(R" speaking, a suffix is the \f(CW$1\fR of /\ew(\ew+)\eb/). .PP .Vb 2 \& # extracting suffixes of size 3 \& $suffixes = get_suffixes( $text ); \& \& # extracting suffixes of sizes 2 and 3 \& $suffixes = get_suffixes( {min => 2}, $text ); \& \& # extracting suffixes of sizes 3 and 4 \& $suffixes = get_suffixes( {max => 4}, $text ); \& \& # extracting suffixes of sizes 2, 3 and 4 \& $suffixes = get_suffixes( {min => 2, max=> 4}, $text); .Ve .SH "OPTIONS" .IX Header "OPTIONS" Apart from deciding on a minimum and maximum size for prefixes or suffixes, you can also decide on some configuration options. .SS "exclude_numbers" .IX Subsection "exclude_numbers" Set to 0 if you consider numbers as part of words. Default value is 1. .PP .Vb 2 \& # this \& get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" ); \& \& # returns this: \& { \& 1 => { \& \*(Aqo\*(Aq => 1, \& \*(Aqt\*(Aq => 1, \& \*(Aq8\*(Aq => 1 \& } \& } .Ve .SS "lowercase" .IX Subsection "lowercase" Set to 1 to extract all prefixes in lowercase mode. Default value is 0. .PP \&\s-1ATTENTION:\s0 This does not mean that prefixes with uppercased characters won't be extracted. It means they will be extracted after being lowercased. .PP .Vb 2 \& # this... \& get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello"); \& \& # returns this: \& { \& 2 => { \& \*(Aqhe\*(Aq => 2 \& } \& } .Ve .SH "TO DO" .IX Header "TO DO" .IP "\(bu" 6 Make it more efficient (use C for that) .SH "AUTHOR" .IX Header "AUTHOR" Jose Castro, \f(CW\*(C`\*(C'\fR .SH "COPYRIGHT & LICENSE" .IX Header "COPYRIGHT & LICENSE" Copyright 2004 Jose Castro, All Rights Reserved. .PP This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.