.\" Automatically generated by Pod::Man 4.14 (Pod::Simple 3.43) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is >0, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{\ . if \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{\ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" ======================================================================== .\" .IX Title "URL::Search 3pm" .TH URL::Search 3pm "2023-03-22" "perl v5.36.0" "User Contributed Perl Documentation" .\" For nroff, turn off justification. 
Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" URL::Search \- search for URLs in plain text .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 1 \& use URL::Search qw( $URL_SEARCH_RE extract_urls partition_urls ); \& \& if ($text =~ /($URL_SEARCH_RE)/) { \& print "the first URL in text was: $1\en"; \& } \& \& my @all_urls = extract_urls $text; .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" This module searches plain text for URLs and extracts them. It exports (on request) the following entities: .ie n .SS "$URL_SEARCH_RE" .el .SS "\f(CW$URL_SEARCH_RE\fP" .IX Subsection "$URL_SEARCH_RE" This variable is the core of this module. It contains a regex that matches a \s-1URL.\s0 .PP \&\s-1NOTE:\s0 This regex uses capturing groups internally, so if you embed it in a bigger pattern, the numbering of any following capture groups will be off. If this is an issue, use named capture groups of the form \f(CW\*(C`(?<NAME>...)\*(C'\fR instead. See \*(L"Capture groups\*(R" in perlre. .PP It only matches URLs with an explicit scheme (one of \f(CW\*(C`http\*(C'\fR or \f(CW\*(C`https\*(C'\fR). The pattern is deliberately not anchored at the beginning, i.e. it will match \&\f(CW\*(C`http://foo\*(C'\fR in \f(CW"click herehttp://foo"\fR. If you don't want that, use \&\f(CW\*(C`/\eb$URL_SEARCH_RE/\*(C'\fR. .PP It tries to exclude artifacts of the surrounding text: .PP .Vb 2 \& Is mayonnaise an instrument? (https://en.wikipedia.org/wiki/Instrument, \& https://en.wikipedia.org/wiki/Mayonnaise_(instrument)) .Ve .PP In this example it will match \f(CW\*(C`https://en.wikipedia.org/wiki/Instrument\*(C'\fR and \&\f(CW\*(C`https://en.wikipedia.org/wiki/Mayonnaise_(instrument)\*(C'\fR, without the comma after \*(L"Instrument\*(R" and the final closing parenthesis. .PP It understands all common \s-1URL\s0 elements: username, hostname, port, path, query string, fragment identifier. 
The hostname can be an \s-1IP\s0 address (IPv4 and IPv6 are both supported). .PP Unicode is supported (e.g. \f(CW\*(C`http://поддомен.example.com/déjà\-vu?utf8=✓\*(C'\fR is matched correctly). .ie n .SS """extract_urls""" .el .SS "\f(CWextract_urls\fP" .IX Subsection "extract_urls" This function takes a string and returns a list of all contained URLs. .PP It uses \f(CW$URL_SEARCH_RE\fR to find matches. .PP Example: .PP .Vb 3 \& my $text = \*(AqVisit us at http://html5zombo.com. Also, https://archive.org\*(Aq; \& my @urls = extract_urls $text; \& # @urls = (\*(Aqhttp://html5zombo.com\*(Aq, \*(Aqhttps://archive.org\*(Aq) .Ve .ie n .SS """partition_urls""" .el .SS "\f(CWpartition_urls\fP" .IX Subsection "partition_urls" This function takes a string and splits it up into text and \s-1URL\s0 segments. It returns a list of array references, each of which has two elements: The type (the string \f(CW\*(AqTEXT\*(Aq\fR or \f(CW\*(AqURL\*(Aq\fR) and the portion of the input string that was classified as text or \s-1URL,\s0 respectively. .PP Example: .PP .Vb 8 \& my $text = \*(AqVisit us at http://html5zombo.com. Also, https://archive.org\*(Aq; \& my @parts = partition_urls $text; \& # @parts = ( \& # [ \*(AqTEXT\*(Aq, \*(AqVisit us at \*(Aq ], \& # [ \*(AqURL\*(Aq, \*(Aqhttp://html5zombo.com\*(Aq ], \& # [ \*(AqTEXT\*(Aq, \*(Aq. Also, \*(Aq ], \& # [ \*(AqURL\*(Aq, \*(Aqhttps://archive.org\*(Aq ], \& # ) .Ve .PP You can reassemble the original string by concatenating the second elements of the returned arrayrefs, i.e. \&\f(CW\*(C`join(\*(Aq\*(Aq, map { $_\->[1] } partition_urls($text)) eq $text\*(C'\fR. 
.PP This function can be useful if you want to render plain text as \s-1HTML\s0 but hyperlink all embedded URLs: .PP .Vb 2 \& use URL::Search qw(partition_urls); \& use HTML::Entities qw(encode_entities); \& \& my $text = ...; \& \& my $html = \*(Aq\*(Aq; \& for my $part (partition_urls $text) { \& my ($type, $str) = @$part; \& $str = encode_entities $str; \& if ($type eq \*(AqURL\*(Aq) { \& $html .= "<a href=\e"$str\e">$str</a>"; \& } else { \& $html .= $str; \& } \& } \& # result is in $html .Ve .SH "SUPPORT AND DOCUMENTATION" .IX Header "SUPPORT AND DOCUMENTATION" After installing, you can find documentation for this module with the \&\f(CW\*(C`perldoc\*(C'\fR command. .PP .Vb 1 \& perldoc URL::Search .Ve .PP You can also look for information at <https://metacpan.org/pod/URL::Search>. .PP To see a list of open bugs, visit <https://rt.cpan.org/Public/Dist/Display.html?Name=URL\-Search>. .PP To report a new bug, send an email to \&\f(CW\*(C`bug\-URL\-Search [at] rt.cpan.org\*(C'\fR. .SH "AUTHOR" .IX Header "AUTHOR" Lukas Mai, \f(CW\*(C`<l.mai at web.de>\*(C'\fR .SH "COPYRIGHT & LICENSE" .IX Header "COPYRIGHT & LICENSE" Copyright 2016, 2017, 2023 Lukas Mai. .PP This program is free software; you can redistribute it and/or modify it under the terms of either: the \s-1GNU\s0 General Public License as published by the Free Software Foundation; or the Artistic License. .PP See <https://dev.perl.org/licenses/> for more information.