Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/framework/filecontent.py b/framework/filecontent.py
new file mode 100644
index 0000000..15d2940
--- /dev/null
+++ b/framework/filecontent.py
@@ -0,0 +1,204 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""Utility routines for dealing with MIME types and decoding text files."""
+
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+
+import itertools
+import logging
+
+from framework import framework_constants
+
+
+_EXTENSION_TO_CTYPE_TABLE = {
+ # These are images/PDFs that we trust the browser to display.
+ 'gif': 'image/gif',
+ 'jpg': 'image/jpeg',
+ 'jpeg': 'image/jpeg',
+ 'png': 'image/png',
+ 'webp': 'image/webp',
+ 'ico': 'image/x-icon',
+ 'svg': 'image/svg+xml',
+ 'pdf': 'application/pdf',
+ 'ogv': 'video/ogg',
+ 'mov': 'video/quicktime',
+ 'mp4': 'video/mp4',
+ 'mpg': 'video/mp4',
+ 'mpeg': 'video/mp4',
+ 'webm': 'video/webm',
+
+ # We do not serve mimetypes that cause the brower to launch a local
+ # app because that is not required for issue tracking and it is a
+ # potential security risk.
+}
+
+
+def GuessContentTypeFromFilename(filename):
+ """Guess a file's content type based on the filename extension.
+
+ Args:
+ filename: String name of a file.
+
+ Returns:
+ MIME type string to use when serving this file. We only use text/plain for
+ text files, appropriate image content-types, or application/octet-stream
+ for virtually all binary files. This limits the richness of the user's
+ experience, e.g., the user cannot open an MS Office application directly
+ by clicking on an attachment, but it is safer.
+ """
+ ext = filename.split('.')[-1] if ('.' in filename) else ''
+ ext = ext.lower()
+ if ext in COMMON_TEXT_FILE_EXTENSIONS:
+ return 'text/plain'
+ return _EXTENSION_TO_CTYPE_TABLE.get(ext.lower(), 'application/octet-stream')
+
+
+# Constants used in detecting if a file has binary content.
+# All line lengths must be below the upper limit, and there must be a spefic
+# ratio below the lower limit.
+_MAX_SOURCE_LINE_LEN_LOWER = 350
+_MAX_SOURCE_LINE_LEN_UPPER = 800
+_SOURCE_LINE_LEN_LOWER_RATIO = 0.9
+
+# Message to display for undecodable commit log or author values.
+UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'
+
+# How large a repository file is in bytes before we don't try to display it
+SOURCE_FILE_MAX_SIZE = 1000 * 1024
+SOURCE_FILE_MAX_LINES = 50000
+
+# The source code browser will not attempt to display any filename ending
+# with one of these extensions.
+COMMON_BINARY_FILE_EXTENSIONS = {
+ 'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
+ 'bmp', 'png', 'vsd,' 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
+ 'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
+ 'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
+ 'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
+ 'graffie', 'violet', 'webm', 'webp',
+ }
+
+# The source code browser will display file contents as text data for files
+# with the following extensions or exact filenames (assuming they decode
+# correctly).
+COMMON_TEXT_FILE_EXTENSIONS = (
+ set(framework_constants.PRETTIFY_CLASS_MAP.keys()) | {
+ '',
+ 'ada',
+ 'asan',
+ 'asm',
+ 'asp',
+ 'bat',
+ 'cgi',
+ 'csv',
+ 'diff',
+ 'el',
+ 'emacs',
+ 'jsp',
+ 'log',
+ 'markdown',
+ 'md',
+ 'mf',
+ 'patch',
+ 'plist',
+ 'properties',
+ 'r',
+ 'rc',
+ 'txt',
+ 'vim',
+ 'wiki',
+ 'xemacs',
+ 'yacc',
+ })
+COMMON_TEXT_FILENAMES = (
+ set(framework_constants.PRETTIFY_FILENAME_CLASS_MAP.keys()) |
+ {'authors', 'install', 'readme'})
+
+
+def DecodeFileContents(file_contents, path=None):
+ """Try converting file contents to unicode using utf-8 or latin-1.
+
+ This is applicable to untrusted maybe-text from vcs files or inbound emails.
+
+ We try decoding the file as utf-8, then fall back on latin-1. In the former
+ case, we call the file a text file; in the latter case, we guess whether
+ the file is text or binary based on line length.
+
+ If we guess text when the file is binary, the user sees safely encoded
+ gibberish. If the other way around, the user sees a message that we will
+ not display the file.
+
+ TODO(jrobbins): we could try the user-supplied encoding, iff it
+ is one of the encodings that we know that we can handle.
+
+ Args:
+ file_contents: byte string from uploaded file. It could be text in almost
+ any encoding, or binary. We cannot trust the user-supplied encoding
+ in the mime-type property.
+ path: string pathname of file.
+
+ Returns:
+ The tuple (unicode_string, is_binary, is_long):
+ - The unicode version of the string.
+ - is_binary is true if the string could not be decoded as text.
+ - is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines.
+ """
+ # If the filename is one that typically identifies a binary file, then
+ # just treat it as binary without any further analysis.
+ ext = None
+ if path and '.' in path:
+ ext = path.split('.')[-1]
+ if ext.lower() in COMMON_BINARY_FILE_EXTENSIONS:
+ # If the file is binary, we don't care about the length, since we don't
+ # show or diff it.
+ return u'', True, False
+
+ # If the string can be decoded as utf-8, we treat it as textual.
+ try:
+ u_str = file_contents.decode('utf-8', 'strict')
+ is_long = len(u_str.split('\n')) > SOURCE_FILE_MAX_LINES
+ return u_str, False, is_long
+ except UnicodeDecodeError:
+ logging.info('not a utf-8 file: %s bytes', len(file_contents))
+
+ # Fall back on latin-1. This will always succeed, since every byte maps to
+ # something in latin-1, even if that something is gibberish.
+ u_str = file_contents.decode('latin-1', 'strict')
+
+ lines = u_str.split('\n')
+ is_long = len(lines) > SOURCE_FILE_MAX_LINES
+ # Treat decodable files with certain filenames and/or extensions as text
+ # files. This avoids problems with common file types using our text/binary
+ # heuristic rules below.
+ if path:
+ name = path.split('/')[-1]
+ if (name.lower() in COMMON_TEXT_FILENAMES or
+ (ext and ext.lower() in COMMON_TEXT_FILE_EXTENSIONS)):
+ return u_str, False, is_long
+
+ # HEURISTIC: Binary files can qualify as latin-1, so we need to
+ # check further. Any real source code is going to be divided into
+ # reasonably sized lines. All lines must be below an upper character limit,
+ # and most lines must be below a lower limit. This allows some exceptions
+ # to the lower limit, but is more restrictive than just using a single
+ # large character limit.
+ is_binary = False
+ lower_count = 0
+ for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):
+ size = len(line)
+ if size <= _MAX_SOURCE_LINE_LEN_LOWER:
+ lower_count += 1
+ elif size > _MAX_SOURCE_LINE_LEN_UPPER:
+ is_binary = True
+ break
+
+ ratio = lower_count / float(max(1, len(lines)))
+ if ratio < _SOURCE_LINE_LEN_LOWER_RATIO:
+ is_binary = True
+
+ return u_str, is_binary, is_long