Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 1 | # Copyright 2016 The Chromium Authors |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 4 | |
| 5 | """Utility routines for dealing with MIME types and decoding text files.""" |
| 6 | |
| 7 | from __future__ import division |
| 8 | from __future__ import print_function |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import itertools |
| 12 | import logging |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 13 | import six |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 14 | |
| 15 | from framework import framework_constants |
| 16 | |
| 17 | |
# Maps a lowercase filename extension (without the leading dot) to the MIME
# type that GuessContentTypeFromFilename() serves for it. Extensions that are
# not listed here fall back to text/plain or application/octet-stream there.
_EXTENSION_TO_CTYPE_TABLE = {
    # These are images, PDFs, and videos that we trust the browser to display.
    'gif': 'image/gif',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',
    'webp': 'image/webp',
    'ico': 'image/x-icon',
    'svg': 'image/svg+xml',
    'pdf': 'application/pdf',
    'ogv': 'video/ogg',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    # NOTE(review): 'mpg' and 'mpeg' are served as video/mp4 rather than
    # video/mpeg -- confirm that this is intentional.
    'mpg': 'video/mp4',
    'mpeg': 'video/mp4',
    'webm': 'video/webm',

    # We do not serve mimetypes that cause the browser to launch a local
    # app because that is not required for issue tracking and it is a
    # potential security risk.
}
| 39 | |
| 40 | |
def GuessContentTypeFromFilename(filename):
  """Choose the MIME type to serve a file with, judging by its extension.

  The extension is whatever follows the last '.' in the name, compared
  case-insensitively. A name with no '.' has the empty extension.

  Args:
    filename: String name of a file.

  Returns:
    MIME type string to use when serving this file: 'text/plain' for
    extensions listed in COMMON_TEXT_FILE_EXTENSIONS, the mapped type for
    extensions listed in _EXTENSION_TO_CTYPE_TABLE, and
    'application/octet-stream' for everything else. This limits the richness
    of the user's experience, e.g., the user cannot open an MS Office
    application directly by clicking on an attachment, but it is safer.
  """
  _, dot, suffix = filename.rpartition('.')
  extension = suffix.lower() if dot else ''

  # Known text extensions take precedence over the content-type table.
  if extension in COMMON_TEXT_FILE_EXTENSIONS:
    return 'text/plain'
  if extension in _EXTENSION_TO_CTYPE_TABLE:
    return _EXTENSION_TO_CTYPE_TABLE[extension]
  return 'application/octet-stream'
| 59 | |
| 60 | |
# Constants used in detecting if a file has binary content.
# No line may be longer than the upper limit, and at least the given ratio
# of the lines must be no longer than the lower limit.
_MAX_SOURCE_LINE_LEN_LOWER = 350
_MAX_SOURCE_LINE_LEN_UPPER = 800
_SOURCE_LINE_LEN_LOWER_RATIO = 0.9

# Message to display for undecodable commit log or author values.
UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'

# How large a repository file is in bytes before we don't try to display it.
SOURCE_FILE_MAX_SIZE = 1000 * 1024
# Files with more lines than this are reported as "long" by
# DecodeFileContents(), which also inspects at most this many lines when
# guessing whether a file is binary.
SOURCE_FILE_MAX_LINES = 50000
| 74 | |
# The source code browser will not attempt to display any filename ending
# with one of these extensions. Entries are lowercase and have no leading
# dot; callers lowercase the extension before testing membership.
# NOTE(review): 'graffie' may be a typo for 'graffle' (OmniGraffle) -- confirm
# before changing it.
COMMON_BINARY_FILE_EXTENSIONS = {
    'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
    # 'vsd' and 'mpg' must be separate entries: a comma inside the quotes
    # would make Python concatenate the two literals into 'vsd,mpg'.
    'bmp', 'png', 'vsd', 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
    'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
    'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
    'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
    'graffie', 'violet', 'webm', 'webp',
}
| 85 | |
# Extensions whose files the source code browser shows as text (provided the
# contents decode correctly). The set combines every extension that has a
# prettify class with the extras listed here; '' is the extension of a
# filename that contains no dot.
COMMON_TEXT_FILE_EXTENSIONS = {
    '',
    'ada',
    'asan',
    'asm',
    'asp',
    'bat',
    'cgi',
    'csv',
    'diff',
    'el',
    'emacs',
    'jsp',
    'log',
    'markdown',
    'md',
    'mf',
    'patch',
    'plist',
    'properties',
    'r',
    'rc',
    'txt',
    'vim',
    'wiki',
    'xemacs',
    'yacc',
}.union(framework_constants.PRETTIFY_CLASS_MAP.keys())

# Exact (lowercased) filenames that are shown as text in the same way,
# regardless of their extension.
COMMON_TEXT_FILENAMES = {'authors', 'install', 'readme'}.union(
    framework_constants.PRETTIFY_FILENAME_CLASS_MAP.keys())
| 121 | |
| 122 | |
def DecodeFileContents(file_contents, path=None):
  """Try converting file contents to unicode using utf-8 or latin-1.

  This is applicable to untrusted maybe-text from vcs files or inbound emails.

  We try decoding the file as utf-8, then fall back on latin-1. In the former
  case, we call the file a text file; in the latter case, we guess whether
  the file is text or binary based on its name and on line length.

  If we guess text when the file is binary, the user sees safely encoded
  gibberish. If the other way around, the user sees a message that we will
  not display the file.

  TODO(jrobbins): we could try the user-supplied encoding, iff it
  is one of the encodings that we know that we can handle.

  Args:
    file_contents: byte string from uploaded file. It could be text in almost
        any encoding, or binary. We cannot trust the user-supplied encoding
        in the mime-type property.
    path: string pathname of file, using '/' as the separator. Optional.

  Returns:
    The tuple (unicode_string, is_binary, is_long):
      - The unicode version of the string. It is empty when the file is
        rejected because of a binary filename extension.
      - is_binary is true if the path has a known binary extension, or if
        the contents are not utf-8 and fail the line-length heuristic.
      - is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines.
        It is always false for files rejected by extension.
  """
  # Take the extension from the final path component only, so that a dot in
  # a directory name (e.g. 'v1.2/Makefile') does not yield a bogus extension.
  name = path.split('/')[-1] if path else ''
  ext = name.rpartition('.')[2].lower() if '.' in name else None

  # If the filename is one that typically identifies a binary file, then
  # just treat it as binary without any further analysis.
  if ext in COMMON_BINARY_FILE_EXTENSIONS:
    # If the file is binary, we don't care about the length, since we don't
    # show or diff it.
    return u'', True, False

  # If the string can be decoded as utf-8, we treat it as textual.
  try:
    u_str = six.ensure_text(file_contents)
  except UnicodeDecodeError:
    logging.info('not a utf-8 file: %s bytes', len(file_contents))
  else:
    # N newlines separate N + 1 lines; counting avoids building the list.
    is_long = u_str.count('\n') + 1 > SOURCE_FILE_MAX_LINES
    return u_str, False, is_long

  # Fall back on latin-1. This will always succeed, since every byte maps to
  # something in latin-1, even if that something is gibberish.
  u_str = six.ensure_text(file_contents, encoding='latin-1')

  lines = u_str.split('\n')
  is_long = len(lines) > SOURCE_FILE_MAX_LINES

  # Treat decodable files with certain filenames and/or extensions as text
  # files. This avoids problems with common file types using our text/binary
  # heuristic rules below.
  if path and (name.lower() in COMMON_TEXT_FILENAMES or
               (ext and ext in COMMON_TEXT_FILE_EXTENSIONS)):
    return u_str, False, is_long

  # HEURISTIC: Binary files can qualify as latin-1, so we need to
  # check further. Any real source code is going to be divided into
  # reasonably sized lines. All lines must be below an upper character limit,
  # and most lines must be below a lower limit. This allows some exceptions
  # to the lower limit, but is more restrictive than just using a single
  # large character limit.
  lower_count = 0
  for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):
    size = len(line)
    if size > _MAX_SOURCE_LINE_LEN_UPPER:
      # A single over-long line is enough to call the file binary.
      return u_str, True, is_long
    if size <= _MAX_SOURCE_LINE_LEN_LOWER:
      lower_count += 1

  # Only the first SOURCE_FILE_MAX_LINES lines were examined, so the ratio
  # must be taken over that many lines; dividing by the full line count would
  # misclassify long text files as binary.
  examined = min(len(lines), SOURCE_FILE_MAX_LINES)
  ratio = lower_count / float(max(1, examined))
  is_binary = ratio < _SOURCE_LINE_LEN_LOWER_RATIO

  return u_str, is_binary, is_long