parser.py 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360
  1. # -*- coding:iso-8859-1 -*-
"""
This module offers a generic date/time string parser which is able to parse
most known formats to represent a date and/or time.

This module attempts to be forgiving with regards to unlikely input formats,
returning a datetime object even for dates which are ambiguous. If an element
of a date/time stamp is omitted, the following rules are applied:

- If AM or PM is left unspecified, a 24-hour clock is assumed, however, an hour
  on a 12-hour clock (``0 <= hour <= 12``) *must* be specified if AM or PM is
  specified.
- If a time zone is omitted, a timezone-naive datetime is returned.

If any other elements are missing, they are taken from the
:class:`datetime.datetime` object passed to the parameter ``default``. If this
results in a day number exceeding the valid number of days per month, the
value falls back to the end of the month.

Additional resources about date/time string formats can be found below:

- `A summary of the international standard date and time notation
  <http://www.cl.cam.ac.uk/~mgk25/iso-time.html>`_
- `W3C Date and Time Formats <http://www.w3.org/TR/NOTE-datetime>`_
- `Time Formats (Planetary Rings Node) <http://pds-rings.seti.org/tools/time_formats.html>`_
- `CPAN ParseDate module
  <http://search.cpan.org/~muir/Time-modules-2013.0912/lib/Time/ParseDate.pm>`_
- `Java SimpleDateFormat Class
  <https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html>`_
"""
  26. from __future__ import unicode_literals
  27. import datetime
  28. import string
  29. import time
  30. import collections
  31. import re
  32. from io import StringIO
  33. from calendar import monthrange, isleap
  34. from six import text_type, binary_type, integer_types
  35. from . import relativedelta
  36. from . import tz
  37. __all__ = ["parse", "parserinfo"]
  38. class _timelex(object):
  39. # Fractional seconds are sometimes split by a comma
  40. _split_decimal = re.compile("([\.,])")
  41. def __init__(self, instream):
  42. if isinstance(instream, binary_type):
  43. instream = instream.decode()
  44. if isinstance(instream, text_type):
  45. instream = StringIO(instream)
  46. if getattr(instream, 'read', None) is None:
  47. raise TypeError('Parser must be a string or character stream, not '
  48. '{itype}'.format(itype=instream.__class__.__name__))
  49. self.instream = instream
  50. self.charstack = []
  51. self.tokenstack = []
  52. self.eof = False
  53. def get_token(self):
  54. """
  55. This function breaks the time string into lexical units (tokens), which
  56. can be parsed by the parser. Lexical units are demarcated by changes in
  57. the character set, so any continuous string of letters is considered
  58. one unit, any continuous string of numbers is considered one unit.
  59. The main complication arises from the fact that dots ('.') can be used
  60. both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
  61. "4:30:21.447"). As such, it is necessary to read the full context of
  62. any dot-separated strings before breaking it into tokens; as such, this
  63. function maintains a "token stack", for when the ambiguous context
  64. demands that multiple tokens be parsed at once.
  65. """
  66. if self.tokenstack:
  67. return self.tokenstack.pop(0)
  68. seenletters = False
  69. token = None
  70. state = None
  71. while not self.eof:
  72. # We only realize that we've reached the end of a token when we
  73. # find a character that's not part of the current token - since
  74. # that character may be part of the next token, it's stored in the
  75. # charstack.
  76. if self.charstack:
  77. nextchar = self.charstack.pop(0)
  78. else:
  79. nextchar = self.instream.read(1)
  80. while nextchar == '\x00':
  81. nextchar = self.instream.read(1)
  82. if not nextchar:
  83. self.eof = True
  84. break
  85. elif not state:
  86. # First character of the token - determines if we're starting
  87. # to parse a word, a number or something else.
  88. token = nextchar
  89. if self.isword(nextchar):
  90. state = 'a'
  91. elif self.isnum(nextchar):
  92. state = '0'
  93. elif self.isspace(nextchar):
  94. token = ' '
  95. break # emit token
  96. else:
  97. break # emit token
  98. elif state == 'a':
  99. # If we've already started reading a word, we keep reading
  100. # letters until we find something that's not part of a word.
  101. seenletters = True
  102. if self.isword(nextchar):
  103. token += nextchar
  104. elif nextchar == '.':
  105. token += nextchar
  106. state = 'a.'
  107. else:
  108. self.charstack.append(nextchar)
  109. break # emit token
  110. elif state == '0':
  111. # If we've already started reading a number, we keep reading
  112. # numbers until we find something that doesn't fit.
  113. if self.isnum(nextchar):
  114. token += nextchar
  115. elif nextchar == '.' or (nextchar == ',' and len(token) >= 2):
  116. token += nextchar
  117. state = '0.'
  118. else:
  119. self.charstack.append(nextchar)
  120. break # emit token
  121. elif state == 'a.':
  122. # If we've seen some letters and a dot separator, continue
  123. # parsing, and the tokens will be broken up later.
  124. seenletters = True
  125. if nextchar == '.' or self.isword(nextchar):
  126. token += nextchar
  127. elif self.isnum(nextchar) and token[-1] == '.':
  128. token += nextchar
  129. state = '0.'
  130. else:
  131. self.charstack.append(nextchar)
  132. break # emit token
  133. elif state == '0.':
  134. # If we've seen at least one dot separator, keep going, we'll
  135. # break up the tokens later.
  136. if nextchar == '.' or self.isnum(nextchar):
  137. token += nextchar
  138. elif self.isword(nextchar) and token[-1] == '.':
  139. token += nextchar
  140. state = 'a.'
  141. else:
  142. self.charstack.append(nextchar)
  143. break # emit token
  144. if (state in ('a.', '0.') and (seenletters or token.count('.') > 1 or
  145. token[-1] in '.,')):
  146. l = self._split_decimal.split(token)
  147. token = l[0]
  148. for tok in l[1:]:
  149. if tok:
  150. self.tokenstack.append(tok)
  151. if state == '0.' and token.count('.') == 0:
  152. token = token.replace(',', '.')
  153. return token
  154. def __iter__(self):
  155. return self
  156. def __next__(self):
  157. token = self.get_token()
  158. if token is None:
  159. raise StopIteration
  160. return token
  161. def next(self):
  162. return self.__next__() # Python 2.x support
  163. @classmethod
  164. def split(cls, s):
  165. return list(cls(s))
  166. @classmethod
  167. def isword(cls, nextchar):
  168. """ Whether or not the next character is part of a word """
  169. return nextchar.isalpha()
  170. @classmethod
  171. def isnum(cls, nextchar):
  172. """ Whether the next character is part of a number """
  173. return nextchar.isdigit()
  174. @classmethod
  175. def isspace(cls, nextchar):
  176. """ Whether the next character is whitespace """
  177. return nextchar.isspace()
  178. class _resultbase(object):
  179. def __init__(self):
  180. for attr in self.__slots__:
  181. setattr(self, attr, None)
  182. def _repr(self, classname):
  183. l = []
  184. for attr in self.__slots__:
  185. value = getattr(self, attr)
  186. if value is not None:
  187. l.append("%s=%s" % (attr, repr(value)))
  188. return "%s(%s)" % (classname, ", ".join(l))
  189. def __len__(self):
  190. return (sum(getattr(self, attr) is not None
  191. for attr in self.__slots__))
  192. def __repr__(self):
  193. return self._repr(self.__class__.__name__)
  194. class parserinfo(object):
  195. """
  196. Class which handles what inputs are accepted. Subclass this to customize
  197. the language and acceptable values for each parameter.
  198. :param dayfirst:
  199. Whether to interpret the first value in an ambiguous 3-integer date
  200. (e.g. 01/05/09) as the day (``True``) or month (``False``). If
  201. ``yearfirst`` is set to ``True``, this distinguishes between YDM
  202. and YMD. Default is ``False``.
  203. :param yearfirst:
  204. Whether to interpret the first value in an ambiguous 3-integer date
  205. (e.g. 01/05/09) as the year. If ``True``, the first number is taken
  206. to be the year, otherwise the last number is taken to be the year.
  207. Default is ``False``.
  208. """
  209. # m from a.m/p.m, t from ISO T separator
  210. JUMP = [" ", ".", ",", ";", "-", "/", "'",
  211. "at", "on", "and", "ad", "m", "t", "of",
  212. "st", "nd", "rd", "th"]
  213. WEEKDAYS = [("Mon", "Monday"),
  214. ("Tue", "Tuesday"),
  215. ("Wed", "Wednesday"),
  216. ("Thu", "Thursday"),
  217. ("Fri", "Friday"),
  218. ("Sat", "Saturday"),
  219. ("Sun", "Sunday")]
  220. MONTHS = [("Jan", "January"),
  221. ("Feb", "February"),
  222. ("Mar", "March"),
  223. ("Apr", "April"),
  224. ("May", "May"),
  225. ("Jun", "June"),
  226. ("Jul", "July"),
  227. ("Aug", "August"),
  228. ("Sep", "Sept", "September"),
  229. ("Oct", "October"),
  230. ("Nov", "November"),
  231. ("Dec", "December")]
  232. HMS = [("h", "hour", "hours"),
  233. ("m", "minute", "minutes"),
  234. ("s", "second", "seconds")]
  235. AMPM = [("am", "a"),
  236. ("pm", "p")]
  237. UTCZONE = ["UTC", "GMT", "Z"]
  238. PERTAIN = ["of"]
  239. TZOFFSET = {}
  240. def __init__(self, dayfirst=False, yearfirst=False):
  241. self._jump = self._convert(self.JUMP)
  242. self._weekdays = self._convert(self.WEEKDAYS)
  243. self._months = self._convert(self.MONTHS)
  244. self._hms = self._convert(self.HMS)
  245. self._ampm = self._convert(self.AMPM)
  246. self._utczone = self._convert(self.UTCZONE)
  247. self._pertain = self._convert(self.PERTAIN)
  248. self.dayfirst = dayfirst
  249. self.yearfirst = yearfirst
  250. self._year = time.localtime().tm_year
  251. self._century = self._year // 100 * 100
  252. def _convert(self, lst):
  253. dct = {}
  254. for i, v in enumerate(lst):
  255. if isinstance(v, tuple):
  256. for v in v:
  257. dct[v.lower()] = i
  258. else:
  259. dct[v.lower()] = i
  260. return dct
  261. def jump(self, name):
  262. return name.lower() in self._jump
  263. def weekday(self, name):
  264. if len(name) >= 3:
  265. try:
  266. return self._weekdays[name.lower()]
  267. except KeyError:
  268. pass
  269. return None
  270. def month(self, name):
  271. if len(name) >= 3:
  272. try:
  273. return self._months[name.lower()] + 1
  274. except KeyError:
  275. pass
  276. return None
  277. def hms(self, name):
  278. try:
  279. return self._hms[name.lower()]
  280. except KeyError:
  281. return None
  282. def ampm(self, name):
  283. try:
  284. return self._ampm[name.lower()]
  285. except KeyError:
  286. return None
  287. def pertain(self, name):
  288. return name.lower() in self._pertain
  289. def utczone(self, name):
  290. return name.lower() in self._utczone
  291. def tzoffset(self, name):
  292. if name in self._utczone:
  293. return 0
  294. return self.TZOFFSET.get(name)
  295. def convertyear(self, year, century_specified=False):
  296. if year < 100 and not century_specified:
  297. year += self._century
  298. if abs(year - self._year) >= 50:
  299. if year < self._year:
  300. year += 100
  301. else:
  302. year -= 100
  303. return year
  304. def validate(self, res):
  305. # move to info
  306. if res.year is not None:
  307. res.year = self.convertyear(res.year, res.century_specified)
  308. if res.tzoffset == 0 and not res.tzname or res.tzname == 'Z':
  309. res.tzname = "UTC"
  310. res.tzoffset = 0
  311. elif res.tzoffset != 0 and res.tzname and self.utczone(res.tzname):
  312. res.tzoffset = 0
  313. return True
  314. class _ymd(list):
  315. def __init__(self, tzstr, *args, **kwargs):
  316. super(self.__class__, self).__init__(*args, **kwargs)
  317. self.century_specified = False
  318. self.tzstr = tzstr
  319. @staticmethod
  320. def token_could_be_year(token, year):
  321. try:
  322. return int(token) == year
  323. except ValueError:
  324. return False
  325. @staticmethod
  326. def find_potential_year_tokens(year, tokens):
  327. return [token for token in tokens if _ymd.token_could_be_year(token, year)]
  328. def find_probable_year_index(self, tokens):
  329. """
  330. attempt to deduce if a pre 100 year was lost
  331. due to padded zeros being taken off
  332. """
  333. for index, token in enumerate(self):
  334. potential_year_tokens = _ymd.find_potential_year_tokens(token, tokens)
  335. if len(potential_year_tokens) == 1 and len(potential_year_tokens[0]) > 2:
  336. return index
  337. def append(self, val):
  338. if hasattr(val, '__len__'):
  339. if val.isdigit() and len(val) > 2:
  340. self.century_specified = True
  341. elif val > 100:
  342. self.century_specified = True
  343. super(self.__class__, self).append(int(val))
  344. def resolve_ymd(self, mstridx, yearfirst, dayfirst):
  345. len_ymd = len(self)
  346. year, month, day = (None, None, None)
  347. if len_ymd > 3:
  348. raise ValueError("More than three YMD values")
  349. elif len_ymd == 1 or (mstridx != -1 and len_ymd == 2):
  350. # One member, or two members with a month string
  351. if mstridx != -1:
  352. month = self[mstridx]
  353. del self[mstridx]
  354. if len_ymd > 1 or mstridx == -1:
  355. if self[0] > 31:
  356. year = self[0]
  357. else:
  358. day = self[0]
  359. elif len_ymd == 2:
  360. # Two members with numbers
  361. if self[0] > 31:
  362. # 99-01
  363. year, month = self
  364. elif self[1] > 31:
  365. # 01-99
  366. month, year = self
  367. elif dayfirst and self[1] <= 12:
  368. # 13-01
  369. day, month = self
  370. else:
  371. # 01-13
  372. month, day = self
  373. elif len_ymd == 3:
  374. # Three members
  375. if mstridx == 0:
  376. month, day, year = self
  377. elif mstridx == 1:
  378. if self[0] > 31 or (yearfirst and self[2] <= 31):
  379. # 99-Jan-01
  380. year, month, day = self
  381. else:
  382. # 01-Jan-01
  383. # Give precendence to day-first, since
  384. # two-digit years is usually hand-written.
  385. day, month, year = self
  386. elif mstridx == 2:
  387. # WTF!?
  388. if self[1] > 31:
  389. # 01-99-Jan
  390. day, year, month = self
  391. else:
  392. # 99-01-Jan
  393. year, day, month = self
  394. else:
  395. if self[0] > 31 or \
  396. self.find_probable_year_index(_timelex.split(self.tzstr)) == 0 or \
  397. (yearfirst and self[1] <= 12 and self[2] <= 31):
  398. # 99-01-01
  399. if dayfirst and self[2] <= 12:
  400. year, day, month = self
  401. else:
  402. year, month, day = self
  403. elif self[0] > 12 or (dayfirst and self[1] <= 12):
  404. # 13-01-01
  405. day, month, year = self
  406. else:
  407. # 01-13-01
  408. month, day, year = self
  409. return year, month, day
  410. class parser(object):
  411. def __init__(self, info=None):
  412. self.info = info or parserinfo()
  413. def parse(self, timestr, default=None, ignoretz=False, tzinfos=None, **kwargs):
  414. """
  415. Parse the date/time string into a :class:`datetime.datetime` object.
  416. :param timestr:
  417. Any date/time string using the supported formats.
  418. :param default:
  419. The default datetime object, if this is a datetime object and not
  420. ``None``, elements specified in ``timestr`` replace elements in the
  421. default object.
  422. :param ignoretz:
  423. If set ``True``, time zones in parsed strings are ignored and a
  424. naive :class:`datetime.datetime` object is returned.
  425. :param tzinfos:
  426. Additional time zone names / aliases which may be present in the
  427. string. This argument maps time zone names (and optionally offsets
  428. from those time zones) to time zones. This parameter can be a
  429. dictionary with timezone aliases mapping time zone names to time
  430. zones or a function taking two parameters (``tzname`` and
  431. ``tzoffset``) and returning a time zone.
  432. The timezones to which the names are mapped can be an integer
  433. offset from UTC in minutes or a :class:`tzinfo` object.
  434. .. doctest::
  435. :options: +NORMALIZE_WHITESPACE
  436. >>> from dateutil.parser import parse
  437. >>> from dateutil.tz import gettz
  438. >>> tzinfos = {"BRST": -10800, "CST": gettz("America/Chicago")}
  439. >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
  440. datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -10800))
  441. >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
  442. datetime.datetime(2012, 1, 19, 17, 21,
  443. tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
  444. This parameter is ignored if ``ignoretz`` is set.
  445. :param **kwargs:
  446. Keyword arguments as passed to ``_parse()``.
  447. :return:
  448. Returns a :class:`datetime.datetime` object or, if the
  449. ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
  450. first element being a :class:`datetime.datetime` object, the second
  451. a tuple containing the fuzzy tokens.
  452. :raises ValueError:
  453. Raised for invalid or unknown string format, if the provided
  454. :class:`tzinfo` is not in a valid format, or if an invalid date
  455. would be created.
  456. :raises OverflowError:
  457. Raised if the parsed date exceeds the largest valid C integer on
  458. your system.
  459. """
  460. if default is None:
  461. effective_dt = datetime.datetime.now()
  462. default = datetime.datetime.now().replace(hour=0, minute=0,
  463. second=0, microsecond=0)
  464. else:
  465. effective_dt = default
  466. res, skipped_tokens = self._parse(timestr, **kwargs)
  467. if res is None:
  468. raise ValueError("Unknown string format")
  469. if len(res) == 0:
  470. raise ValueError("String does not contain a date.")
  471. repl = {}
  472. for attr in ("year", "month", "day", "hour",
  473. "minute", "second", "microsecond"):
  474. value = getattr(res, attr)
  475. if value is not None:
  476. repl[attr] = value
  477. if 'day' not in repl:
  478. # If the default day exceeds the last day of the month, fall back to
  479. # the end of the month.
  480. cyear = default.year if res.year is None else res.year
  481. cmonth = default.month if res.month is None else res.month
  482. cday = default.day if res.day is None else res.day
  483. if cday > monthrange(cyear, cmonth)[1]:
  484. repl['day'] = monthrange(cyear, cmonth)[1]
  485. ret = default.replace(**repl)
  486. if res.weekday is not None and not res.day:
  487. ret = ret+relativedelta.relativedelta(weekday=res.weekday)
  488. if not ignoretz:
  489. if (isinstance(tzinfos, collections.Callable) or
  490. tzinfos and res.tzname in tzinfos):
  491. if isinstance(tzinfos, collections.Callable):
  492. tzdata = tzinfos(res.tzname, res.tzoffset)
  493. else:
  494. tzdata = tzinfos.get(res.tzname)
  495. if isinstance(tzdata, datetime.tzinfo):
  496. tzinfo = tzdata
  497. elif isinstance(tzdata, text_type):
  498. tzinfo = tz.tzstr(tzdata)
  499. elif isinstance(tzdata, integer_types):
  500. tzinfo = tz.tzoffset(res.tzname, tzdata)
  501. else:
  502. raise ValueError("Offset must be tzinfo subclass, "
  503. "tz string, or int offset.")
  504. ret = ret.replace(tzinfo=tzinfo)
  505. elif res.tzname and res.tzname in time.tzname:
  506. ret = ret.replace(tzinfo=tz.tzlocal())
  507. elif res.tzoffset == 0:
  508. ret = ret.replace(tzinfo=tz.tzutc())
  509. elif res.tzoffset:
  510. ret = ret.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset))
  511. if kwargs.get('fuzzy_with_tokens', False):
  512. return ret, skipped_tokens
  513. else:
  514. return ret
  515. class _result(_resultbase):
  516. __slots__ = ["year", "month", "day", "weekday",
  517. "hour", "minute", "second", "microsecond",
  518. "tzname", "tzoffset", "ampm"]
  519. def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False,
  520. fuzzy_with_tokens=False):
  521. """
  522. Private method which performs the heavy lifting of parsing, called from
  523. ``parse()``, which passes on its ``kwargs`` to this function.
  524. :param timestr:
  525. The string to parse.
  526. :param dayfirst:
  527. Whether to interpret the first value in an ambiguous 3-integer date
  528. (e.g. 01/05/09) as the day (``True``) or month (``False``). If
  529. ``yearfirst`` is set to ``True``, this distinguishes between YDM
  530. and YMD. If set to ``None``, this value is retrieved from the
  531. current :class:`parserinfo` object (which itself defaults to
  532. ``False``).
  533. :param yearfirst:
  534. Whether to interpret the first value in an ambiguous 3-integer date
  535. (e.g. 01/05/09) as the year. If ``True``, the first number is taken
  536. to be the year, otherwise the last number is taken to be the year.
  537. If this is set to ``None``, the value is retrieved from the current
  538. :class:`parserinfo` object (which itself defaults to ``False``).
  539. :param fuzzy:
  540. Whether to allow fuzzy parsing, allowing for string like "Today is
  541. January 1, 2047 at 8:21:00AM".
  542. :param fuzzy_with_tokens:
  543. If ``True``, ``fuzzy`` is automatically set to True, and the parser
  544. will return a tuple where the first element is the parsed
  545. :class:`datetime.datetime` datetimestamp and the second element is
  546. a tuple containing the portions of the string which were ignored:
  547. .. doctest::
  548. >>> from dateutil.parser import parse
  549. >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
  550. (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
  551. """
  552. if fuzzy_with_tokens:
  553. fuzzy = True
  554. info = self.info
  555. if dayfirst is None:
  556. dayfirst = info.dayfirst
  557. if yearfirst is None:
  558. yearfirst = info.yearfirst
  559. res = self._result()
  560. l = _timelex.split(timestr) # Splits the timestr into tokens
  561. # keep up with the last token skipped so we can recombine
  562. # consecutively skipped tokens (-2 for when i begins at 0).
  563. last_skipped_token_i = -2
  564. skipped_tokens = list()
  565. try:
  566. # year/month/day list
  567. ymd = _ymd(timestr)
  568. # Index of the month string in ymd
  569. mstridx = -1
  570. len_l = len(l)
  571. i = 0
  572. while i < len_l:
  573. # Check if it's a number
  574. try:
  575. value_repr = l[i]
  576. value = float(value_repr)
  577. except ValueError:
  578. value = None
  579. if value is not None:
  580. # Token is a number
  581. len_li = len(l[i])
  582. i += 1
  583. if (len(ymd) == 3 and len_li in (2, 4)
  584. and res.hour is None and (i >= len_l or (l[i] != ':' and
  585. info.hms(l[i]) is None))):
  586. # 19990101T23[59]
  587. s = l[i-1]
  588. res.hour = int(s[:2])
  589. if len_li == 4:
  590. res.minute = int(s[2:])
  591. elif len_li == 6 or (len_li > 6 and l[i-1].find('.') == 6):
  592. # YYMMDD or HHMMSS[.ss]
  593. s = l[i-1]
  594. if not ymd and l[i-1].find('.') == -1:
  595. #ymd.append(info.convertyear(int(s[:2])))
  596. ymd.append(s[:2])
  597. ymd.append(s[2:4])
  598. ymd.append(s[4:])
  599. else:
  600. # 19990101T235959[.59]
  601. res.hour = int(s[:2])
  602. res.minute = int(s[2:4])
  603. res.second, res.microsecond = _parsems(s[4:])
  604. elif len_li in (8, 12, 14):
  605. # YYYYMMDD
  606. s = l[i-1]
  607. ymd.append(s[:4])
  608. ymd.append(s[4:6])
  609. ymd.append(s[6:8])
  610. if len_li > 8:
  611. res.hour = int(s[8:10])
  612. res.minute = int(s[10:12])
  613. if len_li > 12:
  614. res.second = int(s[12:])
  615. elif ((i < len_l and info.hms(l[i]) is not None) or
  616. (i+1 < len_l and l[i] == ' ' and
  617. info.hms(l[i+1]) is not None)):
  618. # HH[ ]h or MM[ ]m or SS[.ss][ ]s
  619. if l[i] == ' ':
  620. i += 1
  621. idx = info.hms(l[i])
  622. while True:
  623. if idx == 0:
  624. res.hour = int(value)
  625. if value % 1:
  626. res.minute = int(60*(value % 1))
  627. elif idx == 1:
  628. res.minute = int(value)
  629. if value % 1:
  630. res.second = int(60*(value % 1))
  631. elif idx == 2:
  632. res.second, res.microsecond = \
  633. _parsems(value_repr)
  634. i += 1
  635. if i >= len_l or idx == 2:
  636. break
  637. # 12h00
  638. try:
  639. value_repr = l[i]
  640. value = float(value_repr)
  641. except ValueError:
  642. break
  643. else:
  644. i += 1
  645. idx += 1
  646. if i < len_l:
  647. newidx = info.hms(l[i])
  648. if newidx is not None:
  649. idx = newidx
  650. elif (i == len_l and l[i-2] == ' ' and
  651. info.hms(l[i-3]) is not None):
  652. # X h MM or X m SS
  653. idx = info.hms(l[i-3]) + 1
  654. if idx == 1:
  655. res.minute = int(value)
  656. if value % 1:
  657. res.second = int(60*(value % 1))
  658. elif idx == 2:
  659. res.second, res.microsecond = \
  660. _parsems(value_repr)
  661. i += 1
  662. elif i+1 < len_l and l[i] == ':':
  663. # HH:MM[:SS[.ss]]
  664. res.hour = int(value)
  665. i += 1
  666. value = float(l[i])
  667. res.minute = int(value)
  668. if value % 1:
  669. res.second = int(60*(value % 1))
  670. i += 1
  671. if i < len_l and l[i] == ':':
  672. res.second, res.microsecond = _parsems(l[i+1])
  673. i += 2
  674. elif i < len_l and l[i] in ('-', '/', '.'):
  675. sep = l[i]
  676. ymd.append(value_repr)
  677. i += 1
  678. if i < len_l and not info.jump(l[i]):
  679. try:
  680. # 01-01[-01]
  681. ymd.append(l[i])
  682. except ValueError:
  683. # 01-Jan[-01]
  684. value = info.month(l[i])
  685. if value is not None:
  686. ymd.append(value)
  687. assert mstridx == -1
  688. mstridx = len(ymd)-1
  689. else:
  690. return None, None
  691. i += 1
  692. if i < len_l and l[i] == sep:
  693. # We have three members
  694. i += 1
  695. value = info.month(l[i])
  696. if value is not None:
  697. ymd.append(value)
  698. mstridx = len(ymd)-1
  699. assert mstridx == -1
  700. else:
  701. ymd.append(l[i])
  702. i += 1
  703. elif i >= len_l or info.jump(l[i]):
  704. if i+1 < len_l and info.ampm(l[i+1]) is not None:
  705. # 12 am
  706. res.hour = int(value)
  707. if res.hour < 12 and info.ampm(l[i+1]) == 1:
  708. res.hour += 12
  709. elif res.hour == 12 and info.ampm(l[i+1]) == 0:
  710. res.hour = 0
  711. i += 1
  712. else:
  713. # Year, month or day
  714. ymd.append(value)
  715. i += 1
  716. elif info.ampm(l[i]) is not None:
  717. # 12am
  718. res.hour = int(value)
  719. if res.hour < 12 and info.ampm(l[i]) == 1:
  720. res.hour += 12
  721. elif res.hour == 12 and info.ampm(l[i]) == 0:
  722. res.hour = 0
  723. i += 1
  724. elif not fuzzy:
  725. return None, None
  726. else:
  727. i += 1
  728. continue
  729. # Check weekday
  730. value = info.weekday(l[i])
  731. if value is not None:
  732. res.weekday = value
  733. i += 1
  734. continue
  735. # Check month name
  736. value = info.month(l[i])
  737. if value is not None:
  738. ymd.append(value)
  739. assert mstridx == -1
  740. mstridx = len(ymd)-1
  741. i += 1
  742. if i < len_l:
  743. if l[i] in ('-', '/'):
  744. # Jan-01[-99]
  745. sep = l[i]
  746. i += 1
  747. ymd.append(l[i])
  748. i += 1
  749. if i < len_l and l[i] == sep:
  750. # Jan-01-99
  751. i += 1
  752. ymd.append(l[i])
  753. i += 1
  754. elif (i+3 < len_l and l[i] == l[i+2] == ' '
  755. and info.pertain(l[i+1])):
  756. # Jan of 01
  757. # In this case, 01 is clearly year
  758. try:
  759. value = int(l[i+3])
  760. except ValueError:
  761. # Wrong guess
  762. pass
  763. else:
  764. # Convert it here to become unambiguous
  765. ymd.append(str(info.convertyear(value)))
  766. i += 4
  767. continue
  768. # Check am/pm
  769. value = info.ampm(l[i])
  770. if value is not None:
  771. # For fuzzy parsing, 'a' or 'am' (both valid English words)
  772. # may erroneously trigger the AM/PM flag. Deal with that
  773. # here.
  774. val_is_ampm = True
  775. # If there's already an AM/PM flag, this one isn't one.
  776. if fuzzy and res.ampm is not None:
  777. val_is_ampm = False
  778. # If AM/PM is found and hour is not, raise a ValueError
  779. if res.hour is None:
  780. if fuzzy:
  781. val_is_ampm = False
  782. else:
  783. raise ValueError('No hour specified with ' +
  784. 'AM or PM flag.')
  785. elif not 0 <= res.hour <= 12:
  786. # If AM/PM is found, it's a 12 hour clock, so raise
  787. # an error for invalid range
  788. if fuzzy:
  789. val_is_ampm = False
  790. else:
  791. raise ValueError('Invalid hour specified for ' +
  792. '12-hour clock.')
  793. if val_is_ampm:
  794. if value == 1 and res.hour < 12:
  795. res.hour += 12
  796. elif value == 0 and res.hour == 12:
  797. res.hour = 0
  798. res.ampm = value
  799. i += 1
  800. continue
  801. # Check for a timezone name
  802. if (res.hour is not None and len(l[i]) <= 5 and
  803. res.tzname is None and res.tzoffset is None and
  804. not [x for x in l[i] if x not in
  805. string.ascii_uppercase]):
  806. res.tzname = l[i]
  807. res.tzoffset = info.tzoffset(res.tzname)
  808. i += 1
  809. # Check for something like GMT+3, or BRST+3. Notice
  810. # that it doesn't mean "I am 3 hours after GMT", but
  811. # "my time +3 is GMT". If found, we reverse the
  812. # logic so that timezone parsing code will get it
  813. # right.
  814. if i < len_l and l[i] in ('+', '-'):
  815. l[i] = ('+', '-')[l[i] == '+']
  816. res.tzoffset = None
  817. if info.utczone(res.tzname):
  818. # With something like GMT+3, the timezone
  819. # is *not* GMT.
  820. res.tzname = None
  821. continue
  822. # Check for a numbered timezone
  823. if res.hour is not None and l[i] in ('+', '-'):
  824. signal = (-1, 1)[l[i] == '+']
  825. i += 1
  826. len_li = len(l[i])
  827. if len_li == 4:
  828. # -0300
  829. res.tzoffset = int(l[i][:2])*3600+int(l[i][2:])*60
  830. elif i+1 < len_l and l[i+1] == ':':
  831. # -03:00
  832. res.tzoffset = int(l[i])*3600+int(l[i+2])*60
  833. i += 2
  834. elif len_li <= 2:
  835. # -[0]3
  836. res.tzoffset = int(l[i][:2])*3600
  837. else:
  838. return None, None
  839. i += 1
  840. res.tzoffset *= signal
  841. # Look for a timezone name between parenthesis
  842. if (i+3 < len_l and
  843. info.jump(l[i]) and l[i+1] == '(' and l[i+3] == ')' and
  844. 3 <= len(l[i+2]) <= 5 and
  845. not [x for x in l[i+2]
  846. if x not in string.ascii_uppercase]):
  847. # -0300 (BRST)
  848. res.tzname = l[i+2]
  849. i += 4
  850. continue
  851. # Check jumps
  852. if not (info.jump(l[i]) or fuzzy):
  853. return None, None
  854. if last_skipped_token_i == i - 1:
  855. # recombine the tokens
  856. skipped_tokens[-1] += l[i]
  857. else:
  858. # just append
  859. skipped_tokens.append(l[i])
  860. last_skipped_token_i = i
  861. i += 1
  862. # Process year/month/day
  863. year, month, day = ymd.resolve_ymd(mstridx, yearfirst, dayfirst)
  864. if year is not None:
  865. res.year = year
  866. res.century_specified = ymd.century_specified
  867. if month is not None:
  868. res.month = month
  869. if day is not None:
  870. res.day = day
  871. except (IndexError, ValueError, AssertionError):
  872. return None, None
  873. if not info.validate(res):
  874. return None, None
  875. if fuzzy_with_tokens:
  876. return res, tuple(skipped_tokens)
  877. else:
  878. return res, None
# Module-level parser instance built with the default constructor arguments;
# parse() falls back to it when the caller does not supply a parserinfo.
DEFAULTPARSER = parser()
  880. def parse(timestr, parserinfo=None, **kwargs):
  881. """
  882. Parse a string in one of the supported formats, using the
  883. ``parserinfo`` parameters.
  884. :param timestr:
  885. A string containing a date/time stamp.
  886. :param parserinfo:
  887. A :class:`parserinfo` object containing parameters for the parser.
  888. If ``None``, the default arguments to the :class:`parserinfo`
  889. constructor are used.
  890. The ``**kwargs`` parameter takes the following keyword arguments:
  891. :param default:
  892. The default datetime object, if this is a datetime object and not
  893. ``None``, elements specified in ``timestr`` replace elements in the
  894. default object.
  895. :param ignoretz:
  896. If set ``True``, time zones in parsed strings are ignored and a naive
  897. :class:`datetime` object is returned.
  898. :param tzinfos:
  899. Additional time zone names / aliases which may be present in the
  900. string. This argument maps time zone names (and optionally offsets
  901. from those time zones) to time zones. This parameter can be a
  902. dictionary with timezone aliases mapping time zone names to time
  903. zones or a function taking two parameters (``tzname`` and
  904. ``tzoffset``) and returning a time zone.
  905. The timezones to which the names are mapped can be an integer
  906. offset from UTC in minutes or a :class:`tzinfo` object.
  907. .. doctest::
  908. :options: +NORMALIZE_WHITESPACE
  909. >>> from dateutil.parser import parse
  910. >>> from dateutil.tz import gettz
  911. >>> tzinfos = {"BRST": -10800, "CST": gettz("America/Chicago")}
  912. >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
  913. datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -10800))
  914. >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
  915. datetime.datetime(2012, 1, 19, 17, 21,
  916. tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
  917. This parameter is ignored if ``ignoretz`` is set.
  918. :param dayfirst:
  919. Whether to interpret the first value in an ambiguous 3-integer date
  920. (e.g. 01/05/09) as the day (``True``) or month (``False``). If
  921. ``yearfirst`` is set to ``True``, this distinguishes between YDM and
  922. YMD. If set to ``None``, this value is retrieved from the current
  923. :class:`parserinfo` object (which itself defaults to ``False``).
  924. :param yearfirst:
  925. Whether to interpret the first value in an ambiguous 3-integer date
  926. (e.g. 01/05/09) as the year. If ``True``, the first number is taken to
  927. be the year, otherwise the last number is taken to be the year. If
  928. this is set to ``None``, the value is retrieved from the current
  929. :class:`parserinfo` object (which itself defaults to ``False``).
  930. :param fuzzy:
  931. Whether to allow fuzzy parsing, allowing for string like "Today is
  932. January 1, 2047 at 8:21:00AM".
  933. :param fuzzy_with_tokens:
  934. If ``True``, ``fuzzy`` is automatically set to True, and the parser
  935. will return a tuple where the first element is the parsed
  936. :class:`datetime.datetime` datetimestamp and the second element is
  937. a tuple containing the portions of the string which were ignored:
  938. .. doctest::
  939. >>> from dateutil.parser import parse
  940. >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
  941. (datetime.datetime(2011, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
  942. :return:
  943. Returns a :class:`datetime.datetime` object or, if the
  944. ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
  945. first element being a :class:`datetime.datetime` object, the second
  946. a tuple containing the fuzzy tokens.
  947. :raises ValueError:
  948. Raised for invalid or unknown string format, if the provided
  949. :class:`tzinfo` is not in a valid format, or if an invalid date
  950. would be created.
  951. :raises OverflowError:
  952. Raised if the parsed date exceeds the largest valid C integer on
  953. your system.
  954. """
  955. if parserinfo:
  956. return parser(parserinfo).parse(timestr, **kwargs)
  957. else:
  958. return DEFAULTPARSER.parse(timestr, **kwargs)
  959. class _tzparser(object):
  960. class _result(_resultbase):
  961. __slots__ = ["stdabbr", "stdoffset", "dstabbr", "dstoffset",
  962. "start", "end"]
  963. class _attr(_resultbase):
  964. __slots__ = ["month", "week", "weekday",
  965. "yday", "jyday", "day", "time"]
  966. def __repr__(self):
  967. return self._repr("")
  968. def __init__(self):
  969. _resultbase.__init__(self)
  970. self.start = self._attr()
  971. self.end = self._attr()
  972. def parse(self, tzstr):
  973. res = self._result()
  974. l = _timelex.split(tzstr)
  975. try:
  976. len_l = len(l)
  977. i = 0
  978. while i < len_l:
  979. # BRST+3[BRDT[+2]]
  980. j = i
  981. while j < len_l and not [x for x in l[j]
  982. if x in "0123456789:,-+"]:
  983. j += 1
  984. if j != i:
  985. if not res.stdabbr:
  986. offattr = "stdoffset"
  987. res.stdabbr = "".join(l[i:j])
  988. else:
  989. offattr = "dstoffset"
  990. res.dstabbr = "".join(l[i:j])
  991. i = j
  992. if (i < len_l and (l[i] in ('+', '-') or l[i][0] in
  993. "0123456789")):
  994. if l[i] in ('+', '-'):
  995. # Yes, that's right. See the TZ variable
  996. # documentation.
  997. signal = (1, -1)[l[i] == '+']
  998. i += 1
  999. else:
  1000. signal = -1
  1001. len_li = len(l[i])
  1002. if len_li == 4:
  1003. # -0300
  1004. setattr(res, offattr, (int(l[i][:2])*3600 +
  1005. int(l[i][2:])*60)*signal)
  1006. elif i+1 < len_l and l[i+1] == ':':
  1007. # -03:00
  1008. setattr(res, offattr,
  1009. (int(l[i])*3600+int(l[i+2])*60)*signal)
  1010. i += 2
  1011. elif len_li <= 2:
  1012. # -[0]3
  1013. setattr(res, offattr,
  1014. int(l[i][:2])*3600*signal)
  1015. else:
  1016. return None
  1017. i += 1
  1018. if res.dstabbr:
  1019. break
  1020. else:
  1021. break
  1022. if i < len_l:
  1023. for j in range(i, len_l):
  1024. if l[j] == ';':
  1025. l[j] = ','
  1026. assert l[i] == ','
  1027. i += 1
  1028. if i >= len_l:
  1029. pass
  1030. elif (8 <= l.count(',') <= 9 and
  1031. not [y for x in l[i:] if x != ','
  1032. for y in x if y not in "0123456789"]):
  1033. # GMT0BST,3,0,30,3600,10,0,26,7200[,3600]
  1034. for x in (res.start, res.end):
  1035. x.month = int(l[i])
  1036. i += 2
  1037. if l[i] == '-':
  1038. value = int(l[i+1])*-1
  1039. i += 1
  1040. else:
  1041. value = int(l[i])
  1042. i += 2
  1043. if value:
  1044. x.week = value
  1045. x.weekday = (int(l[i])-1) % 7
  1046. else:
  1047. x.day = int(l[i])
  1048. i += 2
  1049. x.time = int(l[i])
  1050. i += 2
  1051. if i < len_l:
  1052. if l[i] in ('-', '+'):
  1053. signal = (-1, 1)[l[i] == "+"]
  1054. i += 1
  1055. else:
  1056. signal = 1
  1057. res.dstoffset = (res.stdoffset+int(l[i]))*signal
  1058. elif (l.count(',') == 2 and l[i:].count('/') <= 2 and
  1059. not [y for x in l[i:] if x not in (',', '/', 'J', 'M',
  1060. '.', '-', ':')
  1061. for y in x if y not in "0123456789"]):
  1062. for x in (res.start, res.end):
  1063. if l[i] == 'J':
  1064. # non-leap year day (1 based)
  1065. i += 1
  1066. x.jyday = int(l[i])
  1067. elif l[i] == 'M':
  1068. # month[-.]week[-.]weekday
  1069. i += 1
  1070. x.month = int(l[i])
  1071. i += 1
  1072. assert l[i] in ('-', '.')
  1073. i += 1
  1074. x.week = int(l[i])
  1075. if x.week == 5:
  1076. x.week = -1
  1077. i += 1
  1078. assert l[i] in ('-', '.')
  1079. i += 1
  1080. x.weekday = (int(l[i])-1) % 7
  1081. else:
  1082. # year day (zero based)
  1083. x.yday = int(l[i])+1
  1084. i += 1
  1085. if i < len_l and l[i] == '/':
  1086. i += 1
  1087. # start time
  1088. len_li = len(l[i])
  1089. if len_li == 4:
  1090. # -0300
  1091. x.time = (int(l[i][:2])*3600+int(l[i][2:])*60)
  1092. elif i+1 < len_l and l[i+1] == ':':
  1093. # -03:00
  1094. x.time = int(l[i])*3600+int(l[i+2])*60
  1095. i += 2
  1096. if i+1 < len_l and l[i+1] == ':':
  1097. i += 2
  1098. x.time += int(l[i])
  1099. elif len_li <= 2:
  1100. # -[0]3
  1101. x.time = (int(l[i][:2])*3600)
  1102. else:
  1103. return None
  1104. i += 1
  1105. assert i == len_l or l[i] == ','
  1106. i += 1
  1107. assert i >= len_l
  1108. except (IndexError, ValueError, AssertionError):
  1109. return None
  1110. return res
# Module-level _tzparser instance; _parsetz() delegates to it.
DEFAULTTZPARSER = _tzparser()
  1112. def _parsetz(tzstr):
  1113. return DEFAULTTZPARSER.parse(tzstr)
  1114. def _parsems(value):
  1115. """Parse a I[.F] seconds value into (seconds, microseconds)."""
  1116. if "." not in value:
  1117. return int(value), 0
  1118. else:
  1119. i, f = value.split(".")
  1120. return int(i), int(f.ljust(6, "0")[:6])
  1121. # vim:ts=4:sw=4:et