해당 서비스 도커화 성공, 룰 추가, 로그인 오류 수정, 소문자 룰 어느정도 해결

This commit is contained in:
Hyungi Ahn
2025-08-01 15:55:27 +09:00
parent ef06cec8d6
commit 809b2af53e
6418 changed files with 1922672 additions and 69 deletions

View File

@@ -0,0 +1,213 @@
# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
import os
import pprint
import sys
import zipfile
from . import timemachine
from .biffh import (
XL_CELL_BLANK, XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_EMPTY, XL_CELL_ERROR,
XL_CELL_NUMBER, XL_CELL_TEXT, XLRDError, biff_text_from_num,
error_text_from_code,
)
from .book import Book, colname, open_workbook_xls
from .compdoc import SIGNATURE as XLS_SIGNATURE
from .formula import * # is constrained by __all__
from .info import __VERSION__, __version__
from .sheet import empty_cell
from .xldate import XLDateError, xldate_as_datetime, xldate_as_tuple
#: descriptions of the file types :mod:`xlrd` can :func:`inspect <inspect_format>`.
#: Keys are the values returned by :func:`inspect_format`, including ``None``.
FILE_FORMAT_DESCRIPTIONS = {
    'xls': 'Excel xls',
    'xlsb': 'Excel 2007 xlsb file',
    'xlsx': 'Excel xlsx file',
    'ods': 'Openoffice.org ODS file',
    'zip': 'Unknown ZIP file',
    None: 'Unknown file type',
}
# Leading bytes that inspect_format() treats as marking a ZIP container.
ZIP_SIGNATURE = b"PK\x03\x04"
# Number of leading bytes to read so that the longer of the two signatures fits.
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))
def inspect_format(path=None, content=None):
    """
    Inspect the content at the supplied path or the :class:`bytes` content provided
    and return the file's type as a :class:`str`, or ``None`` if it cannot
    be determined.

    :param path:
      A :class:`string <str>` path containing the content to inspect.
      ``~`` will be expanded. Only used when ``content`` is empty or ``None``.

    :param content:
      The :class:`bytes` content to inspect.

    :returns:
       A :class:`str`, or ``None`` if the format cannot be determined.
       The return value can always be looked up in :data:`FILE_FORMAT_DESCRIPTIONS`
       to return a human-readable description of the format found.
    """
    if content:
        peek = content[:PEEK_SIZE]
    else:
        path = os.path.expanduser(path)
        with open(path, "rb") as f:
            peek = f.read(PEEK_SIZE)

    if peek.startswith(XLS_SIGNATURE):
        return 'xls'

    if peek.startswith(ZIP_SIGNATURE):
        source = timemachine.BYTES_IO(content) if content else path
        # Use the archive as a context manager so the underlying file handle
        # is released as soon as the member names have been read.
        with zipfile.ZipFile(source) as zf:
            # Workaround for some third party files that use backslashes and
            # mixed-case names: compare on a normalised form (forward
            # slashes, lower case) of every member name.
            component_names = {
                name.replace('\\', '/').lower() for name in zf.namelist()
            }

        if 'xl/workbook.xml' in component_names:
            return 'xlsx'
        if 'xl/workbook.bin' in component_names:
            return 'xlsb'
        if 'content.xml' in component_names:
            return 'ods'
        return 'zip'

    # Neither signature matched: the format is unknown.
    return None
def open_workbook(filename=None,
                  logfile=sys.stdout,
                  verbosity=0,
                  use_mmap=True,
                  file_contents=None,
                  encoding_override=None,
                  formatting_info=False,
                  on_demand=False,
                  ragged_rows=False,
                  ignore_workbook_corruption=False
                  ):
    """
    Open a spreadsheet file for data extraction.

    The input is first passed to :func:`inspect_format`. Anything detected as
    a format other than ``xls`` is rejected; ``xls`` and undetermined formats
    are handed to the xls reader.

    :param filename: The path to the spreadsheet file to be opened.

    :param logfile: An open file to which messages and diagnostics are written.

    :param verbosity: Increases the volume of trace material written to the
                      logfile.

    :param use_mmap:
      Whether to use the mmap module is determined heuristically.
      Use this arg to override the result.
      Current heuristic: mmap is used if it exists.

    :param file_contents:
      A string or an :class:`mmap.mmap` object or some other behave-alike
      object. If ``file_contents`` is supplied, ``filename`` will not be used,
      except (possibly) in messages.

    :param encoding_override:
      Used to overcome missing or bad codepage information
      in older-version files. See :doc:`unicode`.

    :param formatting_info:
      The default is ``False``, which saves memory.
      In this case, "Blank" cells (cells with their own formatting
      information but no data) are treated as empty by ignoring the file's
      ``BLANK`` and ``MULBLANK`` records. This cuts off any bottom or right
      "margin" of rows of empty or blank cells, and only
      :meth:`~xlrd.sheet.Sheet.cell_value` and
      :meth:`~xlrd.sheet.Sheet.cell_type` are available.

      When ``True``, formatting information is read from the spreadsheet
      file. This provides all cells, including empty and blank cells, and
      formatting information is available for each cell.

    :param on_demand:
      Governs whether sheets are all loaded initially or when demanded
      by the caller. See :doc:`on_demand`.

    :param ragged_rows:
      The default of ``False`` means all rows are padded out with empty cells
      so that all rows have the same size as found in
      :attr:`~xlrd.sheet.Sheet.ncols`.

      ``True`` means that there are no empty cells at the ends of rows.
      This can result in substantial memory savings if rows are of widely
      varying sizes. See also the :meth:`~xlrd.sheet.Sheet.row_len` method.

    :param ignore_workbook_corruption:
      This option allows to read corrupted workbooks.
      When ``False`` you may face CompDocError: Workbook corruption.
      When ``True`` that exception will be ignored.

    :returns: An instance of the :class:`~xlrd.book.Book` class.

    :raises XLRDError: if the detected format is anything other than ``xls``.
    """
    detected = inspect_format(filename, file_contents)

    # Unknown formats must be let through: some ancient files that xlrd can
    # parse don't start with the expected signature.
    if detected:
        if detected != 'xls':
            description = FILE_FORMAT_DESCRIPTIONS[detected]
            raise XLRDError(description+'; not supported')

    return open_workbook_xls(
        filename=filename,
        logfile=logfile,
        verbosity=verbosity,
        use_mmap=use_mmap,
        file_contents=file_contents,
        encoding_override=encoding_override,
        formatting_info=formatting_info,
        on_demand=on_demand,
        ragged_rows=ragged_rows,
        ignore_workbook_corruption=ignore_workbook_corruption,
    )
def dump(filename, outfile=sys.stdout, unnumbered=False):
    """
    Debugging aid: write an XLS file's BIFF records to ``outfile`` in
    character and hex form.

    :param filename: The path to the file to be dumped.
    :param outfile: An open file, to which the dump is written.
    :param unnumbered: If true, omit offsets (for meaningful diffs).
    """
    from .biffh import biff_dump

    book = Book()
    # The same file receives both the loader's log output and the dump.
    book.biff2_8_load(filename=filename, logfile=outfile)
    biff_dump(
        book.mem, book.base, book.stream_len,
        base=0, fout=outfile, unnumbered=unnumbered,
    )
def count_records(filename, outfile=sys.stdout):
    """
    Debugging and analysis aid: summarise the file's BIFF records as a
    sorted listing of ``(record_name, count)``.

    :param filename: The path to the file to be summarised.
    :param outfile: An open file, to which the summary is written.
    """
    from .biffh import biff_count_records

    book = Book()
    # The same file receives both the loader's log output and the summary.
    book.biff2_8_load(filename=filename, logfile=outfile)
    biff_count_records(book.mem, book.base, book.stream_len, fout=outfile)

View File

@@ -0,0 +1,643 @@
# -*- coding: utf-8 -*-
# Portions copyright © 2005-2010 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
from __future__ import print_function
import sys
from struct import unpack
from .timemachine import *
DEBUG = 0
class XLRDError(Exception):
    """
    Raised when data cannot be read from an Excel file.
    """
class BaseObject(object):
    """
    Common base for almost all other classes in the package; its only
    contribution is a generic :meth:`dump` method used for debugging.
    """

    # Names of list/dict attributes that dump() should print in full (via
    # repr) instead of summarising as "type, len".
    _repr_these = []

    def dump(self, f=None, header=None, footer=None, indent=0):
        """
        Write every attribute of this object, sorted by name, to ``f``.

        :param f: open file object, to which the dump is written
                  (``sys.stderr`` when omitted)
        :param header: text to write before the dump
        :param footer: text to write after the dump
        :param indent: number of leading spaces (for recursive calls)
        """
        out = sys.stderr if f is None else f

        if hasattr(self, "__slots__"):
            pairs = [(slot, getattr(self, slot)) for slot in self.__slots__]
        else:
            pairs = self.__dict__.items()
        pairs = sorted(pairs)

        pad = " " * indent
        if header is not None:
            print(header, file=out)

        for attr, value in pairs:
            if getattr(value, 'dump', None) and attr != 'book':
                # Nested dumpable object: recurse one level deeper. The
                # 'book' attribute is excluded from recursion.
                value.dump(
                    out,
                    header="%s%s (%s object):" % (pad, attr, value.__class__.__name__),
                    indent=indent+4)
            elif attr not in self._repr_these and isinstance(value, (list, dict)):
                # Containers are summarised rather than printed in full.
                print("%s%s: %s, len = %d" % (pad, attr, type(value), len(value)), file=out)
            else:
                fprintf(out, "%s%s: %r\n", pad, attr, value)

        if footer is not None:
            print(footer, file=out)
# Format-type codes, numbered 0-4 in the order listed.
FUN, FDT, FNU, FGE, FTX = range(5) # unknown, date, number, general, text
# Aliases for the date and number format-type codes.
DATEFORMAT = FDT
NUMBERFORMAT = FNU
# Cell-type codes, numbered 0-6 in the order listed.
(
    XL_CELL_EMPTY,
    XL_CELL_TEXT,
    XL_CELL_NUMBER,
    XL_CELL_DATE,
    XL_CELL_BOOLEAN,
    XL_CELL_ERROR,
    XL_CELL_BLANK, # for use in debugging, gathering stats, etc
) = range(7)
# Display text for each numeric BIFF version code
# (presumably the BIFF version times ten; 0 means the data is not BIFF).
biff_text_from_num = {
    0: "(not BIFF)",
    20: "2.0",
    21: "2.1",
    30: "3",
    40: "4S",
    45: "4W",
    50: "5",
    70: "7",
    80: "8",
    85: "8X",
}
#: This dictionary can be used to produce a text version of the internal codes
#: that Excel uses for error cells.
error_text_from_code = {
    0x00: '#NULL!', # Intersection of two cell ranges is empty
    0x07: '#DIV/0!', # Division by zero
    0x0F: '#VALUE!', # Wrong type of operand
    0x17: '#REF!', # Illegal or deleted cell reference
    0x1D: '#NAME?', # Wrong function or range name
    0x24: '#NUM!', # Value range overflow
    0x2A: '#N/A', # Argument or function not available
}
BIFF_FIRST_UNICODE = 80
XL_WORKBOOK_GLOBALS = WBKBLOBAL = 0x5
XL_WORKBOOK_GLOBALS_4W = 0x100
XL_WORKSHEET = WRKSHEET = 0x10
XL_BOUNDSHEET_WORKSHEET = 0x00
XL_BOUNDSHEET_CHART = 0x02
XL_BOUNDSHEET_VB_MODULE = 0x06
# XL_RK2 = 0x7e
XL_ARRAY = 0x0221
XL_ARRAY2 = 0x0021
XL_BLANK = 0x0201
XL_BLANK_B2 = 0x01
XL_BOF = 0x809
XL_BOOLERR = 0x205
XL_BOOLERR_B2 = 0x5
XL_BOUNDSHEET = 0x85
XL_BUILTINFMTCOUNT = 0x56
XL_CF = 0x01B1
XL_CODEPAGE = 0x42
XL_COLINFO = 0x7D
XL_COLUMNDEFAULT = 0x20 # BIFF2 only
XL_COLWIDTH = 0x24 # BIFF2 only
XL_CONDFMT = 0x01B0
XL_CONTINUE = 0x3c
XL_COUNTRY = 0x8C
XL_DATEMODE = 0x22
XL_DEFAULTROWHEIGHT = 0x0225
XL_DEFCOLWIDTH = 0x55
XL_DIMENSION = 0x200
XL_DIMENSION2 = 0x0
XL_EFONT = 0x45
XL_EOF = 0x0a
XL_EXTERNNAME = 0x23
XL_EXTERNSHEET = 0x17
XL_EXTSST = 0xff
XL_FEAT11 = 0x872
XL_FILEPASS = 0x2f
XL_FONT = 0x31
XL_FONT_B3B4 = 0x231
XL_FORMAT = 0x41e
XL_FORMAT2 = 0x1E # BIFF2, BIFF3
XL_FORMULA = 0x6
XL_FORMULA3 = 0x206
XL_FORMULA4 = 0x406
XL_GCW = 0xab
XL_HLINK = 0x01B8
XL_QUICKTIP = 0x0800
XL_HORIZONTALPAGEBREAKS = 0x1b
XL_INDEX = 0x20b
XL_INTEGER = 0x2 # BIFF2 only
XL_IXFE = 0x44 # BIFF2 only
XL_LABEL = 0x204
XL_LABEL_B2 = 0x04
XL_LABELRANGES = 0x15f
XL_LABELSST = 0xfd
XL_LEFTMARGIN = 0x26
XL_TOPMARGIN = 0x28
XL_RIGHTMARGIN = 0x27
XL_BOTTOMMARGIN = 0x29
XL_HEADER = 0x14
XL_FOOTER = 0x15
XL_HCENTER = 0x83
XL_VCENTER = 0x84
XL_MERGEDCELLS = 0xE5
XL_MSO_DRAWING = 0x00EC
XL_MSO_DRAWING_GROUP = 0x00EB
XL_MSO_DRAWING_SELECTION = 0x00ED
XL_MULRK = 0xbd
XL_MULBLANK = 0xbe
XL_NAME = 0x18
XL_NOTE = 0x1c
XL_NUMBER = 0x203
XL_NUMBER_B2 = 0x3
XL_OBJ = 0x5D
XL_PAGESETUP = 0xA1
XL_PALETTE = 0x92
XL_PANE = 0x41
XL_PRINTGRIDLINES = 0x2B
XL_PRINTHEADERS = 0x2A
XL_RK = 0x27e
XL_ROW = 0x208
XL_ROW_B2 = 0x08
XL_RSTRING = 0xd6
XL_SCL = 0x00A0
XL_SHEETHDR = 0x8F # BIFF4W only
XL_SHEETPR = 0x81
XL_SHEETSOFFSET = 0x8E # BIFF4W only
XL_SHRFMLA = 0x04bc
XL_SST = 0xfc
XL_STANDARDWIDTH = 0x99
XL_STRING = 0x207
XL_STRING_B2 = 0x7
XL_STYLE = 0x293
XL_SUPBOOK = 0x1AE # aka EXTERNALBOOK in OOo docs
XL_TABLEOP = 0x236
XL_TABLEOP2 = 0x37
XL_TABLEOP_B2 = 0x36
XL_TXO = 0x1b6
XL_UNCALCED = 0x5e
XL_UNKNOWN = 0xffff
XL_VERTICALPAGEBREAKS = 0x1a
XL_WINDOW2 = 0x023E
XL_WINDOW2_B2 = 0x003E
XL_WRITEACCESS = 0x5C
XL_WSBOOL = XL_SHEETPR
XL_XF = 0xe0
XL_XF2 = 0x0043 # BIFF2 version of XF record
XL_XF3 = 0x0243 # BIFF3 version of XF record
XL_XF4 = 0x0443 # BIFF4 version of XF record
boflen = {0x0809: 8, 0x0409: 6, 0x0209: 6, 0x0009: 4}
bofcodes = (0x0809, 0x0409, 0x0209, 0x0009)
XL_FORMULA_OPCODES = (0x0006, 0x0406, 0x0206)
# Record opcodes that is_cell_opcode() recognises.
_cell_opcode_list = [
    XL_BOOLERR, XL_FORMULA, XL_FORMULA3, XL_FORMULA4, XL_LABEL,
    XL_LABELSST, XL_MULRK, XL_NUMBER, XL_RK, XL_RSTRING,
]

# Same opcodes keyed in a dict so that membership tests are hash lookups.
_cell_opcode_dict = {}
for _cell_opcode in _cell_opcode_list:
    _cell_opcode_dict.setdefault(_cell_opcode, 1)


def is_cell_opcode(c):
    """Return ``True`` if ``c`` is one of the opcodes in ``_cell_opcode_list``."""
    return c in _cell_opcode_dict
def upkbits(tgt_obj, src, manifest, local_setattr=setattr):
    """
    Unpack bit fields from the integer ``src`` onto attributes of ``tgt_obj``.

    :param tgt_obj: object that receives the attributes
    :param src: integer holding the packed fields
    :param manifest: iterable of ``(shift, mask, attr_name)`` triples; each
                     attribute is set to ``(src & mask) >> shift``
    :param local_setattr: callable used to store each value
    """
    for shift, mask, attr_name in manifest:
        field = src & mask
        local_setattr(tgt_obj, attr_name, field >> shift)
def upkbitsL(tgt_obj, src, manifest, local_setattr=setattr, local_int=int):
    """
    Like :func:`upkbits`, but pass every extracted field through
    ``local_int`` before storing it.

    :param tgt_obj: object that receives the attributes
    :param src: integer holding the packed fields
    :param manifest: iterable of ``(shift, mask, attr_name)`` triples
    :param local_setattr: callable used to store each value
    :param local_int: conversion applied to ``(src & mask) >> shift``
    """
    for shift, mask, attr_name in manifest:
        field = (src & mask) >> shift
        local_setattr(tgt_obj, attr_name, local_int(field))
def unpack_string(data, pos, encoding, lenlen=1):
    """
    Decode a length-prefixed byte string found at ``data[pos:]``.

    :param data: buffer holding the record data
    :param pos: offset of the length prefix
    :param encoding: codec used to decode the string bytes
    :param lenlen: size of the length prefix in bytes (1 or 2)
    :returns: the decoded text
    """
    # 'B' is a 1-byte and 'H' a 2-byte little-endian unsigned length.
    length_format = '<' + 'BH'[lenlen-1]
    (nchars,) = unpack(length_format, data[pos:pos+lenlen])
    start = pos + lenlen
    return unicode(data[start:start+nchars], encoding)
def unpack_string_update_pos(data, pos, encoding, lenlen=1, known_len=None):
    """
    Decode a byte string at ``data[pos:]`` and report where it ends.

    :param data: buffer holding the record data
    :param pos: offset of the length prefix, or of the string itself when
                ``known_len`` is given
    :param encoding: codec used to decode the string bytes
    :param lenlen: size of the length prefix in bytes (1 or 2)
    :param known_len: string length when it is stored apart from the string
                      (as on a NAME record); no prefix is read in that case
    :returns: ``(text, position just past the string)``
    """
    if known_len is None:
        (nchars,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
        pos += lenlen
    else:
        # On a NAME record, the length byte is detached from the front of the string.
        nchars = known_len
    end = pos + nchars
    return (unicode(data[pos:end], encoding), end)
def unpack_unicode(data, pos, lenlen=2):
    """
    Return the unicode string stored at ``data[pos:]``.

    The layout read here is: a character count (``lenlen`` bytes), an
    options byte, an optional 2-byte field (when options bit 0x08, rich
    text, is set), an optional 4-byte field (when options bit 0x04,
    phonetic, is set), then the character data.

    :param data: buffer holding the record data
    :param pos: offset of the character count
    :param lenlen: size of the character count in bytes (1 or 2)
    :returns: the decoded text
    """
    (nchars,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
    if not nchars:
        # Ambiguous whether a 0-length string should have an "options" byte.
        # Returning here avoids a crash if it is missing.
        return UNICODE_LITERAL("")

    pos += lenlen
    options = BYTES_ORD(data[pos])
    pos += 1

    if options & 0x08:
        # rich text: step over the 2-byte field (its value is not needed)
        pos += 2
    if options & 0x04:
        # phonetic: step over the 4-byte field (its value is not needed)
        pos += 4

    if options & 0x01:
        # Uncompressed UTF-16-LE: two bytes per character.
        return unicode(data[pos:pos+2*nchars], 'utf_16_le')

    # Note: this is COMPRESSED (not ASCII!) encoding, one byte per character.
    # Returning the raw bytes would work most of the time if the local
    # codepage was cp1252, but would go wrong for other codepages, so the
    # bytes are always decoded to Unicode as latin_1.
    return unicode(data[pos:pos+nchars], "latin_1")
def unpack_unicode_update_pos(data, pos, lenlen=2, known_len=None):
    """
    Return ``(unicode_strg, updated value of pos)`` for the unicode string
    stored at ``data[pos:]``.

    :param data: buffer holding the record data
    :param pos: offset of the character count, or of the options byte when
                ``known_len`` is given
    :param lenlen: size of the character count in bytes (1 or 2)
    :param known_len: character count when it is stored apart from the
                      string (as on a NAME record)
    :returns: the decoded text and the position just past the string,
              including any rich-text and phonetic data that follow it
    """
    if known_len is None:
        (nchars,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
        pos += lenlen
    else:
        # On a NAME record, the length byte is detached from the front of the string.
        nchars = known_len

    if not nchars and not data[pos:]:
        # Zero-length string with no options byte
        return (UNICODE_LITERAL(""), pos)

    options = BYTES_ORD(data[pos])
    pos += 1
    has_phonetic = options & 0x04
    has_richtext = options & 0x08

    # Bytes that follow the character data and must be skipped as well.
    trailing = 0
    if has_richtext:
        (rt,) = unpack('<H', data[pos:pos+2])
        pos += 2
        trailing += 4 * rt
    if has_phonetic:
        (sz,) = unpack('<i', data[pos:pos+4])
        pos += 4
        trailing += sz

    if options & 0x01:
        # Uncompressed UTF-16-LE
        nbytes = 2*nchars
        strg = unicode(data[pos:pos+nbytes], 'utf_16_le')
    else:
        # Note: this is COMPRESSED (not ASCII!) encoding!!!
        nbytes = nchars
        strg = unicode(data[pos:pos+nbytes], "latin_1")

    return (strg, pos + nbytes + trailing)
def unpack_cell_range_address_list_update_pos(output_list, data, pos, biff_version, addr_size=6):
    """
    Read a counted list of cell range addresses and append them to
    ``output_list`` (which is updated in situ).

    Each stored address is ``(first_row, last_row, first_col, last_col)``;
    it is appended as ``(first_row, last_row + 1, first_col, last_col + 1)``.

    :param output_list: list that receives the range tuples
    :param data: buffer holding the record data
    :param pos: offset of the 2-byte address count
    :param biff_version: not used by this function
    :param addr_size: bytes per address: 6 (1-byte columns) or 8 (2-byte columns)
    :returns: the position just past the last address read
    """
    assert addr_size in (6, 8)
    # Used to assert size == 6 if not BIFF8, but pyWLWriter writes
    # BIFF8-only MERGEDCELLS records in a BIFF5 file!
    (count,) = unpack("<H", data[pos:pos+2])
    pos += 2
    fmt = "<HHBB" if addr_size == 6 else "<HHHH"
    stop = pos + count * addr_size
    while pos < stop:
        ra, rb, ca, cb = unpack(fmt, data[pos:pos+addr_size])
        output_list.append((ra, rb+1, ca, cb+1))
        pos += addr_size
    return pos
_brecstrg = """\
0000 DIMENSIONS_B2
0001 BLANK_B2
0002 INTEGER_B2_ONLY
0003 NUMBER_B2
0004 LABEL_B2
0005 BOOLERR_B2
0006 FORMULA
0007 STRING_B2
0008 ROW_B2
0009 BOF_B2
000A EOF
000B INDEX_B2_ONLY
000C CALCCOUNT
000D CALCMODE
000E PRECISION
000F REFMODE
0010 DELTA
0011 ITERATION
0012 PROTECT
0013 PASSWORD
0014 HEADER
0015 FOOTER
0016 EXTERNCOUNT
0017 EXTERNSHEET
0018 NAME_B2,5+
0019 WINDOWPROTECT
001A VERTICALPAGEBREAKS
001B HORIZONTALPAGEBREAKS
001C NOTE
001D SELECTION
001E FORMAT_B2-3
001F BUILTINFMTCOUNT_B2
0020 COLUMNDEFAULT_B2_ONLY
0021 ARRAY_B2_ONLY
0022 DATEMODE
0023 EXTERNNAME
0024 COLWIDTH_B2_ONLY
0025 DEFAULTROWHEIGHT_B2_ONLY
0026 LEFTMARGIN
0027 RIGHTMARGIN
0028 TOPMARGIN
0029 BOTTOMMARGIN
002A PRINTHEADERS
002B PRINTGRIDLINES
002F FILEPASS
0031 FONT
0032 FONT2_B2_ONLY
0036 TABLEOP_B2
0037 TABLEOP2_B2
003C CONTINUE
003D WINDOW1
003E WINDOW2_B2
0040 BACKUP
0041 PANE
0042 CODEPAGE
0043 XF_B2
0044 IXFE_B2_ONLY
0045 EFONT_B2_ONLY
004D PLS
0051 DCONREF
0055 DEFCOLWIDTH
0056 BUILTINFMTCOUNT_B3-4
0059 XCT
005A CRN
005B FILESHARING
005C WRITEACCESS
005D OBJECT
005E UNCALCED
005F SAVERECALC
0063 OBJECTPROTECT
007D COLINFO
007E RK2_mythical_?
0080 GUTS
0081 WSBOOL
0082 GRIDSET
0083 HCENTER
0084 VCENTER
0085 BOUNDSHEET
0086 WRITEPROT
008C COUNTRY
008D HIDEOBJ
008E SHEETSOFFSET
008F SHEETHDR
0090 SORT
0092 PALETTE
0099 STANDARDWIDTH
009B FILTERMODE
009C FNGROUPCOUNT
009D AUTOFILTERINFO
009E AUTOFILTER
00A0 SCL
00A1 SETUP
00AB GCW
00BD MULRK
00BE MULBLANK
00C1 MMS
00D6 RSTRING
00D7 DBCELL
00DA BOOKBOOL
00DD SCENPROTECT
00E0 XF
00E1 INTERFACEHDR
00E2 INTERFACEEND
00E5 MERGEDCELLS
00E9 BITMAP
00EB MSO_DRAWING_GROUP
00EC MSO_DRAWING
00ED MSO_DRAWING_SELECTION
00EF PHONETIC
00FC SST
00FD LABELSST
00FF EXTSST
013D TABID
015F LABELRANGES
0160 USESELFS
0161 DSF
01AE SUPBOOK
01AF PROTECTIONREV4
01B0 CONDFMT
01B1 CF
01B2 DVAL
01B6 TXO
01B7 REFRESHALL
01B8 HLINK
01BC PASSWORDREV4
01BE DV
01C0 XL9FILE
01C1 RECALCID
0200 DIMENSIONS
0201 BLANK
0203 NUMBER
0204 LABEL
0205 BOOLERR
0206 FORMULA_B3
0207 STRING
0208 ROW
0209 BOF
020B INDEX_B3+
0218 NAME
0221 ARRAY
0223 EXTERNNAME_B3-4
0225 DEFAULTROWHEIGHT
0231 FONT_B3B4
0236 TABLEOP
023E WINDOW2
0243 XF_B3
027E RK
0293 STYLE
0406 FORMULA_B4
0409 BOF
041E FORMAT
0443 XF_B4
04BC SHRFMLA
0800 QUICKTIP
0809 BOF
0862 SHEETLAYOUT
0867 SHEETPROTECTION
0868 RANGEPROTECTION
"""
# Lookup from numeric record code to record name, built from the text of
# _brecstrg: each of its lines holds a hexadecimal code followed by a name.
biff_rec_name_dict = {}
for _buff in _brecstrg.splitlines():
    _numh, _name = _buff.split()
    biff_rec_name_dict[int(_numh, 16)] = _name
# Drop the table text and loop temporaries from the module namespace.
del _buff, _name, _brecstrg
def hex_char_dump(strg, ofs, dlen, base=0, fout=sys.stdout, unnumbered=False):
    """
    Write ``strg[ofs:ofs+dlen]`` to ``fout`` as lines of up to 16 bytes,
    each shown in hex followed by a character rendering.

    In the character column a NUL byte is shown as ``~`` and any other byte
    outside the range space..tilde is shown as ``?``.

    :param strg: the bytes to dump from
    :param ofs: offset of the first byte to dump
    :param dlen: number of bytes to dump (clipped to the end of ``strg``)
    :param base: number printed as the offset of the first byte
    :param fout: open file, to which the dump is written
    :param unnumbered: if true, omit the offset prefix on each line
    """
    endpos = min(ofs + dlen, len(strg))
    num_prefix = ''
    pos = ofs
    while pos < endpos:
        endsub = min(pos + 16, endpos)
        substrg = strg[pos:endsub]
        lensub = endsub - pos
        if lensub <= 0 or lensub != len(substrg):
            # Sanity check on the slice; this report goes to sys.stdout.
            fprintf(
                sys.stdout,
                '??? hex_char_dump: ofs=%d dlen=%d base=%d -> endpos=%d pos=%d endsub=%d substrg=%r\n',
                ofs, dlen, base, endpos, pos, endsub, substrg)
            break

        codes = [BYTES_ORD(c) for c in substrg]
        hexd = ''.join("%02x " % code for code in codes)

        shown = []
        for code in codes:
            ch = chr(code)
            if ch == '\0':
                ch = '~'
            elif not (' ' <= ch <= '~'):
                ch = '?'
            shown.append(ch)
        chard = ''.join(shown)

        if not unnumbered:
            num_prefix = "%5d: " % (base+pos-ofs)
        fprintf(fout, "%s     %-48s %s\n", num_prefix, hexd, chard)
        pos = endsub
def biff_dump(mem, stream_offset, stream_len, base=0, fout=sys.stdout, unnumbered=False):
    """
    Write a record-by-record dump of a BIFF stream to ``fout``.

    Each record is shown as a header line (opcode, record name, length)
    followed by a hex/character dump of its data. Runs of all-zero
    4-byte headers are collapsed into a single "zero bytes skipped" line.

    :param mem: buffer containing the stream; it may extend beyond the
                stream on either side
    :param stream_offset: offset of the stream within ``mem``
    :param stream_len: length of the stream in bytes
    :param base: number printed as the offset of the first stream byte
    :param fout: open file, to which the dump is written
    :param unnumbered: if true, omit offsets (for meaningful diffs)
    """
    pos = stream_offset
    stream_end = stream_offset + stream_len
    adj = base - stream_offset
    dummies = 0
    numbered = not unnumbered
    num_prefix = ''
    while stream_end - pos >= 4:
        rc, length = unpack('<HH', mem[pos:pos+4])
        if rc == 0 and length == 0:
            # Compare only up to stream_end: ``mem`` can be a larger buffer
            # in which the stream is embedded, and bytes past the end of
            # the stream must not affect the padding check.
            if mem[pos:stream_end] == b'\0' * (stream_end - pos):
                # The rest of the stream is zero padding.
                dummies = stream_end - pos
                savpos = pos
                pos = stream_end
                break
            if dummies:
                dummies += 4
            else:
                # Start of a new run of zero headers.
                savpos = pos
                dummies = 4
            pos += 4
        else:
            if dummies:
                # Report the run of zero headers that has just ended.
                if numbered:
                    num_prefix = "%5d: " % (adj + savpos)
                fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies)
                dummies = 0
            recname = biff_rec_name_dict.get(rc, '<UNKNOWN>')
            if numbered:
                num_prefix = "%5d: " % (adj + pos)
            fprintf(fout, "%s%04x %s len = %04x (%d)\n", num_prefix, rc, recname, length, length)
            pos += 4
            hex_char_dump(mem, pos, length, adj+pos, fout, unnumbered)
            pos += length
    if dummies:
        if numbered:
            num_prefix = "%5d: " % (adj + savpos)
        fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies)
    if pos < stream_end:
        # Fewer than 4 bytes left: too short to be a record header.
        if numbered:
            num_prefix = "%5d: " % (adj + pos)
        fprintf(fout, "%s---- Misc bytes at end ----\n", num_prefix)
        hex_char_dump(mem, pos, stream_end-pos, adj + pos, fout, unnumbered)
    elif pos > stream_end:
        fprintf(fout, "Last dumped record has length (%d) that is too large\n", length)
def biff_count_records(mem, stream_offset, stream_len, fout=sys.stdout):
    """
    Count the records in a BIFF stream by record name and write the
    tallies, sorted by name, to ``fout`` as ``count name`` lines.

    An all-zero 4-byte header that is not part of trailing zero padding is
    counted as ``<Dummy (zero)>``; opcodes missing from
    ``biff_rec_name_dict`` are counted as ``Unknown_0x<code>``.

    :param mem: buffer containing the stream; it may extend beyond the
                stream on either side
    :param stream_offset: offset of the stream within ``mem``
    :param stream_len: length of the stream in bytes
    :param fout: open file, to which the summary is written
    """
    pos = stream_offset
    stream_end = stream_offset + stream_len
    tally = {}
    while stream_end - pos >= 4:
        rc, length = unpack('<HH', mem[pos:pos+4])
        if rc == 0 and length == 0:
            # Compare only up to stream_end: ``mem`` can be a larger buffer
            # in which the stream is embedded, and bytes past the end of
            # the stream must not turn padding into counted records.
            if mem[pos:stream_end] == b'\0' * (stream_end - pos):
                break
            recname = "<Dummy (zero)>"
        else:
            recname = biff_rec_name_dict.get(rc, None)
            if recname is None:
                recname = "Unknown_0x%04X" % rc
        tally[recname] = tally.get(recname, 0) + 1
        pos += length + 4
    for recname, count in sorted(tally.items()):
        print("%8d %s" % (count, recname), file=fout)
# Maps a numeric codepage to the name of the codec used to decode it.
# Entries marked "guess" are, per their original comments, unconfirmed.
encoding_from_codepage = {
    1200 : 'utf_16_le',
    10000: 'mac_roman',
    10006: 'mac_greek', # guess
    10007: 'mac_cyrillic', # guess
    10029: 'mac_latin2', # guess
    10079: 'mac_iceland', # guess
    10081: 'mac_turkish', # guess
    32768: 'mac_roman',
    32769: 'cp1252',
}
# some more guessing, for Indic scripts
# codepage 57000 range:
# 2 Devanagari [0]
# 3 Bengali [1]
# 4 Tamil [5]
# 5 Telugu [6]
# 6 Assamese [1] c.f. Bengali
# 7 Oriya [4]
# 8 Kannada [7]
# 9 Malayalam [8]
# 10 Gujarati [3]
# 11 Gurmukhi [2]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,485 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
# No part of the content of this file was derived from the works of
# David Giffin.
"""
Implements the minimal functionality required
to extract a "Workbook" or "Book" stream (as one big string)
from an OLE2 Compound Document file.
"""
from __future__ import print_function
import array
import sys
from struct import unpack
from .timemachine import *
#: Magic cookie that should appear in the first 8 bytes of the file.
SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
EOCSID = -2
FREESID = -1
SATSID = -3
MSATSID = -4
EVILSID = -5
class CompDocError(Exception):
    """Raised when an OLE2 compound document cannot be processed."""
class DirNode(object):
    """
    One entry of the compound document's directory, decoded from its
    128-byte directory record.
    """

    def __init__(self, DID, dent, DEBUG=0, logfile=sys.stdout):
        """
        :param DID: index of this entry in the directory
        :param dent: the 128-byte directory entry
        :param DEBUG: if non-zero, dump the entry once it is decoded
        :param logfile: open file used by :meth:`dump`
        """
        self.DID = DID
        self.logfile = logfile

        # Bytes 64-79: name buffer size, entry type, colour and the DIDs of
        # the left sibling, right sibling and root of the child tree.
        header = unpack('<HBBiii', dent[64:80])
        cbufsize = header[0]
        (self.etype, self.colour, self.left_DID, self.right_DID,
         self.root_DID) = header[1:]

        # Bytes 116-123: first sector id and total size of the stream.
        self.first_SID, self.tot_size = unpack('<ii', dent[116:124])

        if cbufsize == 0:
            self.name = UNICODE_LITERAL('')
        else:
            # omit the trailing U+0000
            self.name = unicode(dent[0:cbufsize-2], 'utf_16_le')

        self.children = []  # filled in later
        self.parent = -1    # indicates orphan; fixed up later
        self.tsinfo = unpack('<IIII', dent[100:116])

        if DEBUG:
            self.dump(DEBUG)

    def dump(self, DEBUG=1):
        """
        Write a one-line description of this entry to ``self.logfile``;
        with ``DEBUG == 2`` the raw timestamp fields are written as well.
        """
        fprintf(
            self.logfile,
            "DID=%d name=%r etype=%d DIDs(left=%d right=%d root=%d parent=%d kids=%r) first_SID=%d tot_size=%d\n",
            self.DID, self.name, self.etype, self.left_DID,
            self.right_DID, self.root_DID, self.parent, self.children, self.first_SID, self.tot_size
        )
        if DEBUG == 2:
            # tsinfo is (cre_lo, cre_hi, mod_lo, mod_hi)
            print("timestamp info", self.tsinfo, file=self.logfile)
def _build_family_tree(dirlist, parent_DID, child_DID):
    """
    Fill in ``children`` and ``parent`` for the directory entries reachable
    from ``child_DID``.

    The walk is in-order: left sibling, the entry itself, right sibling.
    All of them become children of ``parent_DID``. If the entry is a
    storage (``etype == 1``), its own child tree is then processed with the
    entry as the parent.

    :param dirlist: list of :class:`DirNode` objects, indexed by DID
    :param parent_DID: DID of the entry that owns this sibling tree
    :param child_DID: DID of the sibling-tree node to process; a negative
                      value means "no node" and ends the recursion
    """
    if child_DID < 0:
        return

    child = dirlist[child_DID]
    _build_family_tree(dirlist, parent_DID, child.left_DID)
    dirlist[parent_DID].children.append(child_DID)
    child.parent = parent_DID
    _build_family_tree(dirlist, parent_DID, child.right_DID)

    if child.etype == 1:  # storage
        _build_family_tree(dirlist, child_DID, child.root_DID)
class CompDoc(object):
"""
Compound document handler.
:param mem:
The raw contents of the file, as a string, or as an :class:`mmap.mmap`
object. The only operation it needs to support is slicing.
"""
def __init__(self, mem, logfile=sys.stdout, DEBUG=0, ignore_workbook_corruption=False):
    """
    Parse the compound document header and build the structures needed to
    find streams: the MSAT, the SAT, the directory, the SSCS and the SSAT.

    While sectors are consumed they are marked in ``self.seen`` with the
    value 1 (MSAT extension), 2 (SAT), 3 (directory), 4 (SSCS) or 5 (SSAT);
    meeting a sector that is already marked raises :class:`CompDocError`.

    :param mem: the raw contents of the file; only slicing and ``len`` are used
    :param logfile: open file to which warnings and debug output are written
    :param DEBUG: non-zero enables debug output; values above 1 add more
    :param ignore_workbook_corruption: stored on the instance; not otherwise
        used in this method
    :raises CompDocError: if the signature or byte-order marker is wrong, or
        an allocation table refers to an invalid or already-used sector
    """
    self.logfile = logfile
    self.ignore_workbook_corruption = ignore_workbook_corruption
    self.DEBUG = DEBUG
    if mem[0:8] != SIGNATURE:
        raise CompDocError('Not an OLE2 compound document')
    if mem[28:30] != b'\xFE\xFF':
        raise CompDocError('Expected "little-endian" marker, found %r' % mem[28:30])
    revision, version = unpack('<HH', mem[24:28])
    if DEBUG:
        print("\nCompDoc format: version=0x%04x revision=0x%04x" % (version, revision), file=logfile)
    self.mem = mem
    # ssz and sssz are log2 of the sector size and the short-sector size.
    ssz, sssz = unpack('<HH', mem[30:34])
    if ssz > 20: # allows for 2**20 bytes i.e. 1MB
        print("WARNING: sector size (2**%d) is preposterous; assuming 512 and continuing ..."
            % ssz, file=logfile)
        ssz = 9
    if sssz > ssz:
        print("WARNING: short stream sector size (2**%d) is preposterous; assuming 64 and continuing ..."
            % sssz, file=logfile)
        sssz = 6
    self.sec_size = sec_size = 1 << ssz
    self.short_sec_size = 1 << sssz
    if self.sec_size != 512 or self.short_sec_size != 64:
        print("@@@@ sec_size=%d short_sec_size=%d" % (self.sec_size, self.short_sec_size), file=logfile)
    (
        SAT_tot_secs, self.dir_first_sec_sid, _unused, self.min_size_std_stream,
        SSAT_first_sec_sid, SSAT_tot_secs,
        MSATX_first_sec_sid, MSATX_tot_secs,
    ) = unpack('<iiiiiiii', mem[44:76])
    # Everything after the first 512 bytes is treated as sector data.
    mem_data_len = len(mem) - 512
    mem_data_secs, left_over = divmod(mem_data_len, sec_size)
    if left_over:
        #### raise CompDocError("Not a whole number of sectors")
        mem_data_secs += 1
        print("WARNING *** file size (%d) not 512 + multiple of sector size (%d)"
            % (len(mem), sec_size), file=logfile)
    self.mem_data_secs = mem_data_secs # use for checking later
    self.mem_data_len = mem_data_len
    # One byte per sector, recording which structure has claimed it.
    seen = self.seen = array.array('B', [0]) * mem_data_secs
    if DEBUG:
        print('sec sizes', ssz, sssz, sec_size, self.short_sec_size, file=logfile)
        print("mem data: %d bytes == %d sectors" % (mem_data_len, mem_data_secs), file=logfile)
        print("SAT_tot_secs=%d, dir_first_sec_sid=%d, min_size_std_stream=%d"
            % (SAT_tot_secs, self.dir_first_sec_sid, self.min_size_std_stream,), file=logfile)
        print("SSAT_first_sec_sid=%d, SSAT_tot_secs=%d" % (SSAT_first_sec_sid, SSAT_tot_secs,), file=logfile)
        print("MSATX_first_sec_sid=%d, MSATX_tot_secs=%d" % (MSATX_first_sec_sid, MSATX_tot_secs,), file=logfile)
    nent = sec_size // 4 # number of SID entries in a sector
    fmt = "<%di" % nent
    trunc_warned = 0
    #
    # === build the MSAT ===
    #
    # The first 109 MSAT entries are read from bytes 76-511 of the header.
    MSAT = list(unpack('<109i', mem[76:512]))
    SAT_sectors_reqd = (mem_data_secs + nent - 1) // nent
    expected_MSATX_sectors = max(0, (SAT_sectors_reqd - 109 + nent - 2) // (nent - 1))
    actual_MSATX_sectors = 0
    if MSATX_tot_secs == 0 and MSATX_first_sec_sid in (EOCSID, FREESID, 0):
        # Strictly, if there is no MSAT extension, then MSATX_first_sec_sid
        # should be set to EOCSID ... FREESID and 0 have been met in the wild.
        pass # Presuming no extension
    else:
        sid = MSATX_first_sec_sid
        while sid not in (EOCSID, FREESID, MSATSID):
            # Above should be only EOCSID according to MS & OOo docs
            # but Excel doesn't complain about FREESID. Zero is a valid
            # sector number, not a sentinel.
            if DEBUG > 1:
                print('MSATX: sid=%d (0x%08X)' % (sid, sid), file=logfile)
            if sid >= mem_data_secs:
                msg = "MSAT extension: accessing sector %d but only %d in file" % (sid, mem_data_secs)
                # With DEBUG > 1 the problem is logged and the chain is
                # abandoned; otherwise it is an error.
                if DEBUG > 1:
                    print(msg, file=logfile)
                    break
                raise CompDocError(msg)
            elif sid < 0:
                raise CompDocError("MSAT extension: invalid sector id: %d" % sid)
            if seen[sid]:
                raise CompDocError("MSAT corruption: seen[%d] == %d" % (sid, seen[sid]))
            seen[sid] = 1
            actual_MSATX_sectors += 1
            if DEBUG and actual_MSATX_sectors > expected_MSATX_sectors:
                print("[1]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile)
            offset = 512 + sec_size * sid
            MSAT.extend(unpack(fmt, mem[offset:offset+sec_size]))
            sid = MSAT.pop() # last sector id is sid of next sector in the chain
    if DEBUG and actual_MSATX_sectors != expected_MSATX_sectors:
        print("[2]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile)
    if DEBUG:
        print("MSAT: len =", len(MSAT), file=logfile)
        dump_list(MSAT, 10, logfile)
    #
    # === build the SAT ===
    #
    # Each usable MSAT entry names a sector whose contents are appended to the SAT.
    self.SAT = []
    actual_SAT_sectors = 0
    dump_again = 0
    for msidx in xrange(len(MSAT)):
        msid = MSAT[msidx]
        if msid in (FREESID, EOCSID):
            # Specification: the MSAT array may be padded with trailing FREESID entries.
            # Toleration: a FREESID or EOCSID entry anywhere in the MSAT array will be ignored.
            continue
        if msid >= mem_data_secs:
            if not trunc_warned:
                print("WARNING *** File is truncated, or OLE2 MSAT is corrupt!!", file=logfile)
                print("INFO: Trying to access sector %d but only %d available"
                    % (msid, mem_data_secs), file=logfile)
                trunc_warned = 1
            # Mark the out-of-range entry and carry on with the rest.
            MSAT[msidx] = EVILSID
            dump_again = 1
            continue
        elif msid < -2:
            raise CompDocError("MSAT: invalid sector id: %d" % msid)
        if seen[msid]:
            raise CompDocError("MSAT extension corruption: seen[%d] == %d" % (msid, seen[msid]))
        seen[msid] = 2
        actual_SAT_sectors += 1
        if DEBUG and actual_SAT_sectors > SAT_sectors_reqd:
            print("[3]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, actual_SAT_sectors, msid, file=logfile)
        offset = 512 + sec_size * msid
        self.SAT.extend(unpack(fmt, mem[offset:offset+sec_size]))
    if DEBUG:
        print("SAT: len =", len(self.SAT), file=logfile)
        dump_list(self.SAT, 10, logfile)
        # print >> logfile, "SAT ",
        # for i, s in enumerate(self.SAT):
            # print >> logfile, "entry: %4d offset: %6d, next entry: %4d" % (i, 512 + sec_size * i, s)
            # print >> logfile, "%d:%d " % (i, s),
        print(file=logfile)
    # NOTE(review): the EVILSID marking of SAT entries below runs only when
    # DEBUG is set -- confirm that this is intended.
    if DEBUG and dump_again:
        print("MSAT: len =", len(MSAT), file=logfile)
        dump_list(MSAT, 10, logfile)
        for satx in xrange(mem_data_secs, len(self.SAT)):
            self.SAT[satx] = EVILSID
        print("SAT: len =", len(self.SAT), file=logfile)
        dump_list(self.SAT, 10, logfile)
    #
    # === build the directory ===
    #
    dbytes = self._get_stream(
        self.mem, 512, self.SAT, self.sec_size, self.dir_first_sec_sid,
        name="directory", seen_id=3)
    dirlist = []
    did = -1
    # The directory stream is a sequence of 128-byte entries.
    for pos in xrange(0, len(dbytes), 128):
        did += 1
        dirlist.append(DirNode(did, dbytes[pos:pos+128], 0, logfile))
    self.dirlist = dirlist
    _build_family_tree(dirlist, 0, dirlist[0].root_DID) # and stand well back ...
    if DEBUG:
        for d in dirlist:
            d.dump(DEBUG)
    #
    # === get the SSCS ===
    #
    # Directory entry 0 describes the SSCS, from which short streams are read.
    sscs_dir = self.dirlist[0]
    assert sscs_dir.etype == 5 # root entry
    if sscs_dir.first_SID < 0 or sscs_dir.tot_size == 0:
        # Problem reported by Frank Hoffsuemmer: some software was
        # writing -1 instead of -2 (EOCSID) for the first_SID
        # when the SCCS was empty. Not having EOCSID caused assertion
        # failure in _get_stream.
        # Solution: avoid calling _get_stream in any case when the
        # SCSS appears to be empty.
        self.SSCS = ""
    else:
        self.SSCS = self._get_stream(
            self.mem, 512, self.SAT, sec_size, sscs_dir.first_SID,
            sscs_dir.tot_size, name="SSCS", seen_id=4)
    # if DEBUG: print >> logfile, "SSCS", repr(self.SSCS)
    #
    # === build the SSAT ===
    #
    self.SSAT = []
    if SSAT_tot_secs > 0 and sscs_dir.tot_size == 0:
        print("WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero", file=logfile)
    if sscs_dir.tot_size > 0:
        sid = SSAT_first_sec_sid
        nsecs = SSAT_tot_secs
        # Follow the SAT chain, reading at most SSAT_tot_secs sectors.
        while sid >= 0 and nsecs > 0:
            if seen[sid]:
                raise CompDocError("SSAT corruption: seen[%d] == %d" % (sid, seen[sid]))
            seen[sid] = 5
            nsecs -= 1
            start_pos = 512 + sid * sec_size
            news = list(unpack(fmt, mem[start_pos:start_pos+sec_size]))
            self.SSAT.extend(news)
            sid = self.SAT[sid]
        if DEBUG: print("SSAT last sid %d; remaining sectors %d" % (sid, nsecs), file=logfile)
        # The chain must end exactly when the declared sector count is used up.
        assert nsecs == 0 and sid == EOCSID
    if DEBUG:
        print("SSAT", file=logfile)
        dump_list(self.SSAT, 10, logfile)
    if DEBUG:
        print("seen", file=logfile)
        dump_list(seen, 20, logfile)
def _get_stream(self, mem, base, sat, sec_size, start_sid, size=None, name='', seen_id=None):
    """
    Collect a stream by following its sector chain and return its bytes.

    :param mem: buffer that holds the sectors
    :param base: offset in ``mem`` of sector 0
    :param sat: allocation table; ``sat[s]`` is the sector that follows ``s``
    :param sec_size: size of one sector in bytes
    :param start_sid: first sector of the chain
    :param size: expected stream length; when given, no more than this many
                 bytes are collected and a warning is logged if the chain
                 supplies fewer. ``None`` means take every sector whole.
    :param name: stream name, used in messages
    :param seen_id: marker written into ``self.seen`` for each sector used;
                    ``None`` disables the marking and the re-use check
    :raises CompDocError: if a sector has already been used, or the chain
                          leads outside the allocation table
    """
    pieces = []
    sid = start_sid
    remaining = size
    while sid >= 0:
        if seen_id is not None:
            if self.seen[sid]:
                raise CompDocError("%s corruption: seen[%d] == %d" % (name, sid, self.seen[sid]))
            self.seen[sid] = seen_id
        start_pos = base + sid * sec_size
        if size is None:
            # nothing to check against: take the whole sector
            take = sec_size
        else:
            take = min(sec_size, remaining)
            remaining -= take
        pieces.append(mem[start_pos:start_pos+take])
        try:
            sid = sat[sid]
        except IndexError:
            raise CompDocError(
                "OLE2 stream %r: sector allocation table invalid entry (%d)" %
                (name, sid)
            )
    # The chain must end with the end-of-chain marker.
    assert sid == EOCSID
    if size is not None and remaining != 0:
        fprintf(self.logfile,
            "WARNING *** OLE2 stream %r: expected size %d, actual size %d\n",
            name, size, size - remaining)
    return b''.join(pieces)
def _dir_search(self, path, storage_DID=0):
    """
    Find the directory entry addressed by ``path``.

    :param path:
      Sequence of name components. Names are compared case-insensitively.
    :param storage_DID:
      Index in ``self.dirlist`` of the entry whose children are searched.
    :returns:
      The matching entry (``etype == 2``), or ``None`` if no child carries
      the wanted name.
    :raises CompDocError:
      If the last component names an ``etype == 1`` entry (a storage), or
      the matching entry is neither ``etype`` 1 nor 2.
    """
    wanted, remainder = path[0], path[1:]
    entries = self.dirlist
    for did in entries[storage_DID].children:
        node = entries[did]
        if node.name.lower() != wanted.lower():
            continue
        kind = node.etype
        if kind == 2:
            return node
        if kind != 1:
            node.dump(1)
            raise CompDocError("Requested stream is not a 'user stream'")
        if not remainder:
            raise CompDocError("Requested component is a 'storage'")
        # Descend into the storage with the rest of the path.
        return self._dir_search(remainder, did)
    return None
def get_named_stream(self, qname):
    """
    Look the stream up in the compound document's directory and return its
    content, or ``None`` if there is no such stream.

    :param qname:
      Name of the desired stream e.g. ``'Workbook'``; ``/`` separates the
      components of a nested name.
      Should be in Unicode or convertible thereto.
    :returns: The stream content as a byte string, or ``None``.
    """
    entry = self._dir_search(qname.split("/"))
    if entry is None:
        return None
    if entry.tot_size < self.min_size_std_stream:
        # Streams below the threshold are read from ``self.SSCS`` and
        # chained through ``self.SSAT``; their sectors are not recorded
        # in ``self.seen``.
        return self._get_stream(
            self.SSCS, 0, self.SSAT, self.short_sec_size, entry.first_SID,
            entry.tot_size, name=qname + " (from SSCS)", seen_id=None)
    return self._get_stream(
        self.mem, 512, self.SAT, self.sec_size, entry.first_SID,
        entry.tot_size, name=qname, seen_id=entry.DID + 6)
def locate_named_stream(self, qname):
    """
    Interrogate the compound document's directory and report where the
    named stream's bytes can be found.

    If the named stream is not found, ``(None, 0, 0)`` is returned.
    If the named stream is found and is contiguous within the original
    byte sequence (``mem``) used when the document was opened,
    ``(mem, offset_to_start_of_stream, length_of_stream)`` is returned.
    Otherwise a new byte string is built from the fragments and
    ``(new_string, 0, length_of_stream)`` is returned.

    :param qname:
      Name of the desired stream e.g. ``'Workbook'``.
      Should be in Unicode or convertible thereto.
    :raises CompDocError:
      If the directory claims a stream length larger than
      ``self.mem_data_len``.
    """
    entry = self._dir_search(qname.split("/"))
    if entry is None:
        return (None, 0, 0)
    length = entry.tot_size
    if length > self.mem_data_len:
        raise CompDocError("%r stream length (%d bytes) > file data size (%d bytes)"
                           % (qname, length, self.mem_data_len))
    if length < self.min_size_std_stream:
        # Streams below the threshold are always copied out of ``self.SSCS``.
        content = self._get_stream(
            self.SSCS, 0, self.SSAT, self.short_sec_size, entry.first_SID,
            length, qname + " (from SSCS)", None)
        return (content, 0, length)
    located = self._locate_stream(
        self.mem, 512, self.SAT, self.sec_size, entry.first_SID,
        length, qname, entry.DID + 6)
    if self.DEBUG:
        print("\nseen", file=self.logfile)
        dump_list(self.seen, 20, self.logfile)
    return located
def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id):
    """
    Walk a stream's sector chain, marking each sector in ``self.seen``, and
    report where the stream's bytes are.

    :param mem: Buffer holding the sectors.
    :param base: Byte offset in ``mem`` at which sector 0 starts.
    :param sat: Allocation table; ``sat[s]`` is the sector that follows ``s``.
    :param sec_size: Size of one sector in bytes.
    :param start_sid: First sector of the chain; must not be negative.
    :param expected_stream_size: Stream length in bytes.
    :param qname: Stream name; used only in messages.
    :param seen_id: Value stored in ``self.seen`` for each visited sector.
    :returns:
      ``(mem, offset, expected_stream_size)`` when the sectors are
      consecutive; otherwise ``(joined, 0, expected_stream_size)`` where
      ``joined`` is a new byte string assembled from whole sectors.
    :raises CompDocError:
      If ``start_sid`` is negative, if a sector is already marked in
      ``self.seen`` (unless ``self.ignore_workbook_corruption`` is set), or
      if the chain holds more sectors than the expected size allows.
    """
    if start_sid < 0:
        raise CompDocError("_locate_stream: start_sid (%d) is -ve" % start_sid)
    # Expected size rounded up to a whole number of sectors.
    max_sectors = (expected_stream_size + sec_size - 1) // sec_size
    runs = []           # completed (start, end) byte ranges of consecutive sectors
    run_start = -9999   # placeholders; overwritten by the first sector
    run_end = -8888
    prev_sid = -99      # dummy previous SID
    n_found = 0
    sid = start_sid
    while sid >= 0:
        if self.seen[sid] and not self.ignore_workbook_corruption:
            print("_locate_stream(%s): seen" % qname, file=self.logfile)
            dump_list(self.seen, 20, self.logfile)
            raise CompDocError("%s corruption: seen[%d] == %d" % (qname, sid, self.seen[sid]))
        self.seen[sid] = seen_id
        n_found += 1
        if n_found > max_sectors:
            # Note: expected size rounded up to higher sector
            raise CompDocError(
                "%s: size exceeds expected %d bytes; corrupt?"
                % (qname, max_sectors * sec_size)
            )
        if sid == prev_sid + 1:
            # Next sector follows directly: extend the current run.
            run_end += sec_size
        else:
            if prev_sid >= 0:
                # Not the first sector: close the run collected so far.
                runs.append((run_start, run_end))
            run_start = base + sid * sec_size
            run_end = run_start + sec_size
        prev_sid = sid
        sid = sat[sid]
    assert sid == EOCSID
    assert n_found == max_sectors
    if runs:
        runs.append((run_start, run_end))
        return (b''.join(mem[lo:hi] for lo, hi in runs), 0, expected_stream_size)
    # The stream is contiguous ... just what we like!
    return (mem, run_start, expected_stream_size)
# ==========================================================================================
def x_dump_line(alist, stride, f, dpos, equal=0):
    """
    Write one row of a list dump to ``f``.

    The row starts with ``dpos`` right-aligned in five columns, followed by
    ``=`` when ``equal`` is 1 (a blank otherwise), then the items
    ``alist[dpos:dpos + stride]``, each followed by a space.

    :param alist: Sequence being dumped.
    :param stride: Number of items per row.
    :param f: File-like object to write to.
    :param dpos: Index of the first item of the row.
    :param equal: 0 or 1; selects the marker printed after the position.
    """
    cells = ["%5d%s" % (dpos, " ="[equal])]
    cells.extend(str(value) for value in alist[dpos:dpos + stride])
    print(" ".join(cells), end=" \n", file=f)
def dump_list(alist, stride, f=None):
    """
    Write a condensed dump of ``alist`` to ``f``, ``stride`` items per row.

    Each printed row starts with the index of its first item. A row equal to
    the most recently printed unmarked row is skipped; where such a run of
    repeats ends, its last row is printed with an ``=`` marker after the index.

    :param alist: Sequence to dump.
    :param stride: Number of items per row.
    :param f:
      File-like object to write to. Defaults to ``sys.stdout`` as it is
      when the function is called.
    """
    if f is None:
        # Resolve at call time so that a redirected ``sys.stdout`` is honoured;
        # a ``sys.stdout`` default in the signature is frozen at import time.
        f = sys.stdout

    def _dump_line(dpos, equal=0):
        print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f)
        for value in alist[dpos:dpos + stride]:
            print(str(value), end=' ', file=f)
        print(file=f)

    pos = None
    oldpos = None   # index of the last row printed without the "=" marker
    for pos in range(0, len(alist), stride):
        if oldpos is None:
            _dump_line(pos)
            oldpos = pos
        elif alist[pos:pos + stride] != alist[oldpos:oldpos + stride]:
            if pos - oldpos > stride:
                # Rows were skipped: show the last one of the repeated run.
                _dump_line(pos - stride, equal=1)
            _dump_line(pos)
            oldpos = pos
    if oldpos is not None and pos is not None and pos != oldpos:
        # The list ended inside a run of repeats.
        _dump_line(pos, equal=1)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
# Package version string; ``__VERSION__`` is a second name bound to the same value.
__version__ = __VERSION__ = "2.0.1"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,53 @@
##
# <p>Copyright (c) 2006-2012 Stephen John Machin, Lingfo Pty Ltd</p>
# <p>This module is part of the xlrd package, which is released under a BSD-style licence.</p>
##
# timemachine.py -- adaptation for single codebase.
# Currently supported: 2.6 to 2.7, 3.2+
# usage: from timemachine import *
from __future__ import print_function
import sys
python_version = sys.version_info[:2]       # e.g. version 2.6 -> (2, 6)

if python_version < (3, 0):
    # Python 2
    from cStringIO import StringIO as BYTES_IO

    def BYTES_LITERAL(x):
        return x

    def UNICODE_LITERAL(x):
        return x.decode('latin1')

    def fprintf(f, fmt, *vargs):
        # A trailing newline in ``fmt`` ends the line; otherwise the output
        # is followed by a single space.
        newline = fmt.endswith('\n')
        text = (fmt[:-1] if newline else fmt) % vargs
        print(text, end='\n' if newline else ' ', file=f)

    BYTES_ORD = ord
    REPR = repr
    xrange = xrange
    try:
        EXCEL_TEXT_TYPES = basestring       # xlwt: isinstance(obj, EXCEL_TEXT_TYPES)
    except NameError:
        EXCEL_TEXT_TYPES = (str, unicode)
    # following used only to overcome 2.x ElementTree gimmick which
    # returns text as `str` if it's ascii, otherwise `unicode`
    ensure_unicode = unicode                # used only in xlsx.py
else:
    # Python 3
    from io import BytesIO as BYTES_IO

    def BYTES_LITERAL(x):
        return x.encode('latin1')

    def UNICODE_LITERAL(x):
        return x

    def BYTES_ORD(byte):
        return byte

    def fprintf(f, fmt, *vargs):
        # ``%r`` is rewritten to ``%a`` before formatting. A trailing newline
        # in the format ends the line; otherwise the output is followed by a
        # single space.
        template = fmt.replace("%r", "%a")
        terminator = ' '
        if template.endswith('\n'):
            template = template[:-1]
            terminator = '\n'
        print(template % vargs, end=terminator, file=f)

    def unicode(b, enc):
        return b.decode(enc)

    def ensure_unicode(s):
        return s

    EXCEL_TEXT_TYPES = (str, bytes, bytearray)  # xlwt: isinstance(obj, EXCEL_TEXT_TYPES)
    REPR = ascii
    xrange = range
    unichr = chr

View File

@@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2005-2008 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
# No part of the content of this file was derived from the works of David Giffin.
"""
Tools for working with dates and times in Excel files.
The conversion from ``days`` to ``(year, month, day)`` starts with
an integral "julian day number" aka JDN.
FWIW:
- JDN 0 corresponds to noon on Monday November 24 in Gregorian year -4713.
More importantly:
- Noon on Gregorian 1900-03-01 (day 61 in the 1900-based system) is JDN 2415080.0
- Noon on Gregorian 1904-01-02 (day 1 in the 1904-based system) is JDN 2416482.0
"""
import datetime
# Offsets added to an Excel day number to obtain a julian day number (JDN),
# indexed by datemode: [0] is the 1900-based system, [1] the 1904-based one.
_JDN_delta = (2415080 - 61, 2416482 - 1)
# Sanity check: the two day-numbering systems are 1462 days apart.
assert _JDN_delta[1] - _JDN_delta[0] == 1462
# Pre-calculate the datetime epochs for efficiency.
epoch_1904 = datetime.datetime(1904, 1, 1)
epoch_1900 = datetime.datetime(1899, 12, 31)
# Epoch shifted back one day; used for 1900-based values of 60 and above to
# work around the Excel 1900 leap year bug.
epoch_1900_minus_1 = datetime.datetime(1899, 12, 30)
# Smallest day number that is rejected as too large, indexed by datemode.
# This is equivalent to 10000-01-01:
_XLDAYS_TOO_LARGE = (2958466, 2958466 - 1462)
class XLDateError(ValueError):
    """A base class for all datetime-related errors."""


class XLDateNegative(XLDateError):
    """The Excel number is negative: ``xldate < 0.00``."""


class XLDateAmbiguous(XLDateError):
    """The 1900 leap-year problem: ``datemode == 0 and 1.0 <= xldate < 61.0``."""


class XLDateTooLarge(XLDateError):
    """The Excel number represents Gregorian year 10000 or later."""


class XLDateBadDatemode(XLDateError):
    """The ``datemode`` argument is neither 0 nor 1."""


class XLDateBadTuple(XLDateError):
    """A date or time tuple has a component that cannot be converted."""
def xldate_as_tuple(xldate, datemode):
    """
    Convert an Excel number (presumed to represent a date, a datetime or a
    time) into a tuple suitable for feeding to datetime or mx.DateTime
    constructors.

    :param xldate: The Excel number
    :param datemode: 0: 1900-based, 1: 1904-based.
    :raises xlrd.xldate.XLDateNegative:
    :raises xlrd.xldate.XLDateAmbiguous:
    :raises xlrd.xldate.XLDateTooLarge:
    :raises xlrd.xldate.XLDateBadDatemode:
    :raises xlrd.xldate.XLDateError:
    :returns: Gregorian ``(year, month, day, hour, minute, nearest_second)``.

    .. warning::

      When using this function to interpret the contents of a workbook, you
      should pass in the :attr:`~xlrd.book.Book.datemode`
      attribute of that workbook. Whether the workbook has ever been anywhere
      near a Macintosh is irrelevant.

    .. admonition:: Special case

      If ``0.0 <= xldate < 1.0``, it is assumed to represent a time;
      ``(0, 0, 0, hour, minute, second)`` will be returned, unless the time
      rounds up to a whole day, in which case it is treated as day 1.

    .. note::

      ``1904-01-01`` is not regarded as a valid date in the ``datemode==1``
      system; its "serial number" is zero.
    """
    if datemode not in (0, 1):
        raise XLDateBadDatemode(datemode)
    if xldate == 0.00:
        return (0, 0, 0, 0, 0, 0)
    if xldate < 0.00:
        raise XLDateNegative(xldate)

    whole_days = int(xldate)
    day_seconds = int(round((xldate - whole_days) * 86400.0))
    assert 0 <= day_seconds <= 86400
    if day_seconds == 86400:
        # The fraction rounded up to a full day: carry it into the day count.
        whole_days += 1
        day_seconds = 0
    hour, leftover = divmod(day_seconds, 3600)
    minute, second = divmod(leftover, 60)

    if whole_days >= _XLDAYS_TOO_LARGE[datemode]:
        raise XLDateTooLarge(xldate)
    if whole_days == 0:
        return (0, 0, 0, hour, minute, second)
    if whole_days < 61 and datemode == 0:
        raise XLDateAmbiguous(xldate)

    # Julian day number -> Gregorian (year, month, day).
    jdn = whole_days + _JDN_delta[datemode]
    yreg = ((((jdn * 4 + 274277) // 146097) * 3 // 4) + jdn + 1363) * 4 + 3
    mp = ((yreg % 1461) // 4) * 535 + 333
    day = ((mp % 16384) // 535) + 1
    mp >>= 14       # i.e. mp //= 16384
    year_base = yreg // 1461
    if mp >= 10:
        year, month = year_base - 4715, mp - 9
    else:
        year, month = year_base - 4716, mp + 3
    return (year, month, day, hour, minute, second)
def xldate_as_datetime(xldate, datemode):
    """
    Convert an Excel date/time number into a :class:`datetime.datetime` object.

    :param xldate: The Excel number
    :param datemode: 0: 1900-based, 1: 1904-based.
    :returns: A :class:`datetime.datetime` object.
    """
    # Pick the epoch for the 1900/1904 datemode.
    if datemode:
        epoch = epoch_1904
    elif xldate < 60:
        epoch = epoch_1900
    else:
        # Workaround Excel 1900 leap year bug by adjusting the epoch.
        epoch = epoch_1900_minus_1

    # The integer part of the Excel date is the number of days since the
    # epoch; the fractional part is the fraction of the day elapsed.
    whole_days = int(xldate)
    # Round the time of day to Excel's millisecond resolution, then split
    # it into whole seconds and leftover milliseconds.
    total_ms = int(round((xldate - whole_days) * 86400000.0))
    secs, millis = divmod(total_ms, 1000)

    return epoch + datetime.timedelta(days=whole_days, seconds=secs, milliseconds=millis)
# === conversions from date/time to xl numbers
def _leap(y):
    """Return 1 if ``y`` is a leap year under the Gregorian rules, else 0."""
    if y % 400 == 0:
        return 1
    if y % 100 == 0:
        return 0
    return 0 if y % 4 else 1
# Days per month in a non-leap year, indexed by month number 1-12;
# index 0 is an unused placeholder.
_days_in_month = (None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
def xldate_from_date_tuple(date_tuple, datemode):
    """
    Convert a date tuple ``(year, month, day)`` to an Excel date.

    :param date_tuple:
      ``(year, month, day)`` where ``year`` is a Gregorian year,
      ``1 <= month <= 12`` and
      ``1 <= day <= last day of that (year, month)``.
      ``(0, 0, 0)`` is accepted and converts to ``0.0``.
    :param datemode: 0: 1900-based, 1: 1904-based.
    :raises xlrd.xldate.XLDateAmbiguous:
    :raises xlrd.xldate.XLDateBadDatemode:
    :raises xlrd.xldate.XLDateBadTuple:
      ``(year, month, day)`` is too early/late or has invalid component(s)
    :raises xlrd.xldate.XLDateError:
    :returns: The Excel day number as a :class:`float`.
    """
    year, month, day = date_tuple
    ymd = (year, month, day)

    if datemode not in (0, 1):
        raise XLDateBadDatemode(datemode)
    if year == 0 and month == 0 and day == 0:
        return 0.00

    if not (1900 <= year <= 9999):
        raise XLDateBadTuple("Invalid year: %r" % (ymd,))
    if not (1 <= month <= 12):
        raise XLDateBadTuple("Invalid month: %r" % (ymd,))
    # February 29 is the only day allowed beyond the table's limit.
    is_leap_day = day == 29 and month == 2 and _leap(year)
    if day < 1 or (day > _days_in_month[month] and not is_leap_day):
        raise XLDateBadTuple("Invalid day: %r" % (ymd,))

    # January and February are counted as months 10 and 11 of the
    # preceding year in the julian day number calculation.
    if month <= 2:
        shifted_year = year + 4716 - 1
        shifted_month = month + 9
    else:
        shifted_year = year + 4716
        shifted_month = month - 3
    jdn = (1461 * shifted_year // 4) + ((979 * shifted_month + 16) // 32) + \
        day - 1364 - (((shifted_year + 184) // 100) * 3 // 4)

    xldays = jdn - _JDN_delta[datemode]
    if xldays <= 0:
        raise XLDateBadTuple("Invalid (year, month, day): %r" % (ymd,))
    if xldays < 61 and datemode == 0:
        raise XLDateAmbiguous("Before 1900-03-01: %r" % (ymd,))
    return float(xldays)
def xldate_from_time_tuple(time_tuple):
    """
    Convert a time tuple ``(hour, minute, second)`` to an Excel "date" value
    (fraction of a day).

    :param time_tuple:
      ``(hour, minute, second)`` with ``0 <= hour < 24``,
      ``0 <= minute < 60`` and ``0 <= second < 60``.
    :raises xlrd.xldate.XLDateBadTuple: Out-of-range hour, minute, or second
    :returns: The fraction of a day as a :class:`float`.
    """
    hour, minute, second = time_tuple
    in_range = 0 <= hour < 24 and 0 <= minute < 60 and 0 <= second < 60
    if not in_range:
        raise XLDateBadTuple("Invalid (hour, minute, second): %r" % ((hour, minute, second),))
    return ((second / 60.0 + minute) / 60.0 + hour) / 24.0
def xldate_from_datetime_tuple(datetime_tuple, datemode):
    """
    Convert a datetime tuple ``(year, month, day, hour, minute, second)`` to an
    Excel date value.

    The first three items are converted by :func:`xldate_from_date_tuple` and
    the remaining items by :func:`xldate_from_time_tuple`; the two results are
    added. Refer to those functions for the accepted ranges and the
    exceptions raised.

    :param datetime_tuple: ``(year, month, day, hour, minute, second)``
    :param datemode: 0: 1900-based, 1: 1904-based.
    """
    date_part = datetime_tuple[:3]
    time_part = datetime_tuple[3:]
    day_number = xldate_from_date_tuple(date_part, datemode)
    day_fraction = xldate_from_time_tuple(time_part)
    return day_number + day_fraction