blob: 7e7964391f8ed7e95bfe696259aab3ca089a2681 [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2016 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
5"""Utility routines for dealing with MIME types and decoding text files."""
6
7from __future__ import division
8from __future__ import print_function
9from __future__ import absolute_import
10
11import itertools
12import logging
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010013import six
Copybara854996b2021-09-07 19:36:02 +000014
15from framework import framework_constants
16
17
# Maps a lower-case filename extension (without the dot) to the MIME type
# used when serving a file with that extension.
_EXTENSION_TO_CTYPE_TABLE = dict(
    # Images and PDFs that we trust the browser to display.
    gif='image/gif',
    jpg='image/jpeg',
    jpeg='image/jpeg',
    png='image/png',
    webp='image/webp',
    ico='image/x-icon',
    svg='image/svg+xml',
    pdf='application/pdf',

    # Video formats.
    ogv='video/ogg',
    mov='video/quicktime',
    mp4='video/mp4',
    mpg='video/mp4',
    mpeg='video/mp4',
    webm='video/webm',

    # We do not serve mimetypes that cause the browser to launch a local
    # app because that is not required for issue tracking and it is a
    # potential security risk.
)
39
40
def GuessContentTypeFromFilename(filename):
  """Choose the MIME type to serve a file with, based on its extension.

  The extension is whatever follows the last '.' in the filename, compared
  case-insensitively; a filename without any '.' has the empty extension.

  Args:
    filename: String name of a file.

  Returns:
    MIME type string to use when serving this file: 'text/plain' when the
    extension is in COMMON_TEXT_FILE_EXTENSIONS, the matching entry of
    _EXTENSION_TO_CTYPE_TABLE for the image/PDF/video types we trust the
    browser to display, and 'application/octet-stream' for everything else.
    This limits the richness of the user's experience, e.g., the user cannot
    open an MS Office application directly by clicking on an attachment, but
    it is safer.
  """
  _, dot, suffix = filename.rpartition('.')
  # rpartition() yields an empty separator when the name contains no dot.
  extension = suffix.lower() if dot else ''

  if extension in COMMON_TEXT_FILE_EXTENSIONS:
    return 'text/plain'

  try:
    return _EXTENSION_TO_CTYPE_TABLE[extension]
  except KeyError:
    return 'application/octet-stream'
59
60
# Constants used in detecting if a file has binary content.
# All line lengths must be below the upper limit, and at least a specific
# ratio of the lines must also be at or below the lower limit.
_MAX_SOURCE_LINE_LEN_LOWER = 350
_MAX_SOURCE_LINE_LEN_UPPER = 800
_SOURCE_LINE_LEN_LOWER_RATIO = 0.9

# Message to display for undecodable commit log or author values.
UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'

# How large a repository file is in bytes before we don't try to display it.
SOURCE_FILE_MAX_SIZE = 1000 * 1024
SOURCE_FILE_MAX_LINES = 50000

# The source code browser will not attempt to display any filename ending
# with one of these extensions. Entries are lower-case and have no dot.
#
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# are silently concatenated by Python, which previously merged 'vsd' and
# 'mpg' into the single bogus entry 'vsd,mpg'.
COMMON_BINARY_FILE_EXTENSIONS = {
    'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
    'bmp', 'png', 'vsd', 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
    'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
    'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
    'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
    # 'graffie' looks like a misspelling of OmniGraffle's 'graffle'; the old
    # spelling is kept so nothing that matched before stops matching.
    'graffie', 'graffle', 'violet', 'webm', 'webp',
    }
85
# Extensions (lower-case, without the dot) of files whose contents the source
# code browser displays as text data, assuming they decode correctly. This is
# every extension known to PRETTIFY_CLASS_MAP plus the extras listed here; the
# empty string matches a filename that has no extension at all.
COMMON_TEXT_FILE_EXTENSIONS = {
    '',
    'ada',
    'asan',
    'asm',
    'asp',
    'bat',
    'cgi',
    'csv',
    'diff',
    'el',
    'emacs',
    'jsp',
    'log',
    'markdown',
    'md',
    'mf',
    'patch',
    'plist',
    'properties',
    'r',
    'rc',
    'txt',
    'vim',
    'wiki',
    'xemacs',
    'yacc',
}.union(framework_constants.PRETTIFY_CLASS_MAP.keys())

# Exact filenames that are likewise displayed as text: every name known to
# PRETTIFY_FILENAME_CLASS_MAP plus a few conventional extension-less files.
COMMON_TEXT_FILENAMES = {'authors', 'install', 'readme'}.union(
    framework_constants.PRETTIFY_FILENAME_CLASS_MAP.keys())
121
122
def _GuessIsBinaryFromLineLengths(lines):
  """Guess whether latin-1 decoded content is binary, based on line lengths.

  Any real source code is going to be divided into reasonably sized lines.
  All lines must be at or below an upper character limit, and most lines must
  be at or below a lower limit. This allows some exceptions to the lower
  limit, but is more restrictive than just using a single large limit.

  Only the first SOURCE_FILE_MAX_LINES lines are examined.

  Args:
    lines: list of unicode strings, one per line of the file.

  Returns:
    True if the content looks binary, False if it looks like text.
  """
  examined = 0
  lower_count = 0
  for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):
    size = len(line)
    if size > _MAX_SOURCE_LINE_LEN_UPPER:
      # A single over-long line is enough to call the file binary.
      return True
    examined += 1
    if size <= _MAX_SOURCE_LINE_LEN_LOWER:
      lower_count += 1

  # The ratio is taken over the lines actually examined. Dividing by the
  # total line count would make every file that is well over the line limit
  # look binary, because the numerator can never exceed
  # SOURCE_FILE_MAX_LINES.
  ratio = lower_count / float(max(1, examined))
  return ratio < _SOURCE_LINE_LEN_LOWER_RATIO


def DecodeFileContents(file_contents, path=None):
  """Try converting file contents to unicode using utf-8 or latin-1.

  This is applicable to untrusted maybe-text from vcs files or inbound emails.

  We try decoding the file as utf-8, then fall back on latin-1. In the former
  case, we call the file a text file; in the latter case, we guess whether
  the file is text or binary based on its name and on line length.

  If we guess text when the file is binary, the user sees safely encoded
  gibberish. If the other way around, the user sees a message that we will
  not display the file.

  TODO(jrobbins): we could try the user-supplied encoding, iff it
  is one of the encodings that we know that we can handle.

  Args:
    file_contents: byte string from uploaded file. It could be text in almost
        any encoding, or binary. We cannot trust the user-supplied encoding
        in the mime-type property.
    path: string pathname of file.

  Returns:
    The tuple (unicode_string, is_binary, is_long):
      - The unicode version of the string. It is empty when the file is
        rejected as binary purely because of its extension.
      - is_binary is true if the string could not be decoded as text.
      - is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines.
  """
  # If the filename is one that typically identifies a binary file, then
  # just treat it as binary without any further analysis.
  ext = None
  if path and '.' in path:
    ext = path.split('.')[-1].lower()
    if ext in COMMON_BINARY_FILE_EXTENSIONS:
      # If the file is binary, we don't care about the length, since we don't
      # show or diff it.
      return u'', True, False

  # If the string can be decoded as utf-8, we treat it as textual.
  try:
    u_str = six.ensure_text(file_contents)
    is_long = len(u_str.split('\n')) > SOURCE_FILE_MAX_LINES
    return u_str, False, is_long
  except UnicodeDecodeError:
    logging.info('not a utf-8 file: %s bytes', len(file_contents))

  # Fall back on latin-1. This will always succeed, since every byte maps to
  # something in latin-1, even if that something is gibberish.
  u_str = six.ensure_text(file_contents, encoding='latin-1')

  lines = u_str.split('\n')
  is_long = len(lines) > SOURCE_FILE_MAX_LINES

  # Treat decodable files with certain filenames and/or extensions as text
  # files. This avoids problems with common file types using our text/binary
  # heuristic rules below.
  if path:
    name = path.split('/')[-1]
    if (name.lower() in COMMON_TEXT_FILENAMES or
        (ext and ext in COMMON_TEXT_FILE_EXTENSIONS)):
      return u_str, False, is_long

  # HEURISTIC: Binary files can qualify as latin-1, so we need to check
  # further by looking at how the content is divided into lines.
  is_binary = _GuessIsBinaryFromLineLengths(lines)

  return u_str, is_binary, is_long