Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 1 | # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style |
| 3 | # license that can be found in the LICENSE file or at |
| 4 | # https://developers.google.com/open-source/licenses/bsd |
| 5 | |
| 6 | """A set of Python input field validators.""" |
| 7 | from __future__ import print_function |
| 8 | from __future__ import division |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import re |
| 12 | |
| 13 | # RFC 2821-compliant email address regex |
| 14 | # |
| 15 | # Please see sections "4.1.2 Command Argument Syntax" and |
| 16 | # "4.1.3 Address Literals" of: http://www.faqs.org/rfcs/rfc2821.html |
| 17 | # |
| 18 | # The following implementation is still a subset of RFC 2821. Fully |
| 19 | # double-quoted <user> parts are not supported (since the RFC discourages |
| 20 | # their use anyway), and using the backslash to escape other characters |
| 21 | # that are normally invalid, such as commas, is not supported. |
| 22 | # |
| 23 | # The groups in this regular expression are: |
| 24 | # |
| 25 | # <user>: all of the valid non-quoted portion of the email address before |
| 26 | # the @ sign (not including the @ sign) |
| 27 | # |
| 28 | # <domain>: all of the domain name between the @ sign (but not including it) |
| 29 | # and the dot before the TLD (but not including that final dot) |
| 30 | # |
| 31 | # <tld>: the top-level domain after the last dot (but not including that |
| 32 | # final dot) |
| 33 | # |
# NOTE: this pattern is written in verbose syntax and must be compiled with
# re.VERBOSE.  It deliberately carries no inline "(?x)" prefix: an inline
# global flag is only accepted at the very start of a pattern, and
# RE_EMAIL_ONLY below places an anchor in front of this text (doing so with
# the inline flag raises re.error on Python 3.11+).
_RFC_2821_EMAIL_REGEX = r"""
  (?P<user>
    # Part of the username that comes before any dots that may occur in it.
    # At least one of the listed non-dot characters is required before the
    # first dot.
    [-a-zA-Z0-9!#$%&'*+/=?^_`{|}~]+

    # Remaining part of the username that starts with the dot and
    # which may have other dots, if such a part exists.  Only one dot
    # is permitted between each "Atom", and a trailing dot is not permitted.
    (?:[.][-a-zA-Z0-9!#$%&'*+/=?^_`{|}~]+)*
  )

  # Domain name, where subdomains are allowed.  Also, dashes are allowed
  # given that they are preceded and followed by at least one character.
  #
  # Each domain part is written as "non-dashes, then zero or more groups of
  # (dashes followed by non-dashes)".  Requiring at least one dash per group
  # leaves only one way to split a run of letters/digits, which avoids
  # exponential backtracking on long inputs that do not match.
  @(?P<domain>
    (?:[0-9a-zA-Z]+       # at least one non-dash
      (?:[-]+             # plus one or more dashes
        [0-9a-zA-Z]+      # plus at least one non-dash
      )*                  # zero or more of dashes followed by non-dashes
    )                     # one required domain part (may be a sub-domain)

    (?:\.                 # dot separator before additional sub-domain part
      [0-9a-zA-Z]+        # at least one non-dash
      (?:[-]+             # plus one or more dashes
        [0-9a-zA-Z]+      # plus at least one non-dash
      )*                  # zero or more of dashes followed by non-dashes
    )*                    # zero or more additional dot-separated parts
  )
  \.                      # dot separator before TLD

  # TLD, the part after 'usernames@domain.' which can consist of 2-9
  # letters.
  (?P<tld>[a-zA-Z]{2,9})
  """

# object used with <re>.search() or <re>.sub() to find email addresses
# within a string (or with <re>.match() to find email addresses at the
# beginning of a string that may be followed by trailing characters,
# since <re>.match() implicitly anchors at the beginning of the string)
RE_EMAIL_SEARCH = re.compile(_RFC_2821_EMAIL_REGEX, re.VERBOSE)

# object used with <re>.match to find strings that contain *only* a single
# email address.  The end is anchored with \Z rather than $, because $ would
# also match just before a trailing newline and so accept "a@b.com\n".
RE_EMAIL_ONLY = re.compile(
    r'^(?:%s)\Z' % _RFC_2821_EMAIL_REGEX, re.VERBOSE)
| 79 | |
# Accepted URL schemes: http, https and ftp.
_SCHEME_PATTERN = r'(?:https?|ftp)://'

# Host name without a dot: starts with a letter, continues with letters,
# digits or dashes, ends with a letter or digit, and may carry a ":port".
_SHORT_HOST_PATTERN = ''.join([
    r'(?=[a-zA-Z])[-a-zA-Z0-9]*[a-zA-Z0-9](:[0-9]+)?',
    r'/',  # Slash is mandatory for short host names.
    r'[^\s]*',
])

# Host name ending in a dot plus 2-9 letters, with an optional ":port" and
# an optional path made of non-whitespace characters.
_DOTTED_HOST_PATTERN = ''.join([
    r'[-a-zA-Z0-9.]+\.[a-zA-Z]{2,9}(:[0-9]+)?',
    r'(/[^\s]*)?',
])

# Scheme followed by either kind of host; the alternatives share one group.
_URL_REGEX = (
    _SCHEME_PATTERN
    + '(' + _SHORT_HOST_PATTERN + '|' + _DOTTED_HOST_PATTERN + ')')

# A more complete URL regular expression: _URL_REGEX extended with optional
# query ("?...") and fragment ("#...") parts, following the URI regular
# expression given in the URL RFC document:
# http://www.ietf.org/rfc/rfc2396.txt
RE_COMPLEX_URL = re.compile('^' + _URL_REGEX + r'(\?([^# ]*))?(#(.*))?$')
| 98 | |
| 99 | |
def IsValidEmail(s):
  """Check whether s is a properly formatted email address and nothing else.

  Args:
    s: the string to check.

  Returns:
    The result of RE_EMAIL_ONLY.match(s): a (truthy) match object when s is
    accepted, otherwise None.
  """
  email_match = RE_EMAIL_ONLY.match(s)
  return email_match
| 103 | |
| 104 | |
def IsValidMailTo(s):
  """Check whether s is 'mailto:' followed by a properly formatted address.

  Args:
    s: the string to check.

  Returns:
    False when s does not start with 'mailto:'.  Otherwise the result of
    matching the remainder against RE_EMAIL_ONLY: a (truthy) match object
    when the address is accepted, or None when it is not.
  """
  prefix = 'mailto:'
  if not s.startswith(prefix):
    return False
  # Validate only what follows the scheme prefix.
  return RE_EMAIL_ONLY.match(s[len(prefix):])
| 108 | |
| 109 | |
def IsValidURL(s):
  """Check whether s is a properly formatted web or ftp URL.

  Args:
    s: the string to check.

  Returns:
    The result of RE_COMPLEX_URL.match(s): a (truthy) match object when s is
    accepted, otherwise None.
  """
  url_match = RE_COMPLEX_URL.match(s)
  return url_match