"""Parse and interpret date strings.

Consider the following date notations (DMY=04.11.2012):

DMY     04.11.12        europe
YMD     12.11.04        iso
MDY     11.04.12        US
YDM     12.04.11        reverse US
DYM     04.12.11        too uncommon, ignored
MYD     11.12.04        too uncommon, ignored

There's the general problem of ambiguity between the DMY and MDY formats.
Here, we give precedence to the DMY format.

Note that the MDY format can still be used in unambiguous settings or
with the month spelled out, e.g. "2012, 23rd of November"

Similarly, consider the following shortened date notations:

DM      04.11           europe, current year
MY      11.12           quarters
YM      12.11           quarters
MD      11.04           us, current year
DY      23.12           too uncommon, ignored
YD      12.23           too uncommon, ignored

In addition to the different spellings, month names can be spelled out
and the string can be cluttered with additional common words.

Part of the tagit module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# standard imports
from collections import Counter
from datetime import date as ddate, time as dtime, datetime, timedelta
from math import floor

# external imports
from dateutil.relativedelta import relativedelta
from pyparsing import Combine, Group, Literal, Optional, Or, Word, nums, oneOf, ParseException

# tagit imports
from tagit.utils import errors, Struct, flatten

# exports
# Public names of this module. Every entry needs its own trailing comma:
# adjacent string literals are silently concatenated, which would merge two
# names into one nonexistent export.
__all__ = (
        # default format strings
        'DATE_FMT',
        'TIME_FMT',
        'DATETIME_FMT',
        # exceptions
        'DateParserError',
        'TimeParserError',
        'DateFormatError',
        # parsing
        'parse_datetime',
        'guess_datetime',
        # postprocessing
        'increment',
        )

## constants ##
# Default strftime format strings.
DATE_FMT = '%d.%m.%Y'
TIME_FMT = '%H:%M'
# date and time, separated by a comma and a blank
DATETIME_FMT = ', '.join((DATE_FMT, TIME_FMT))

# Literal months: abbreviated and spelled-out English month names -> month number.
MONTH_LIT = {
    label: number
    for number, name in enumerate((
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
        ), start=1)
    # the three-letter abbreviation comes first; for 'May' both labels coincide
    for label in (name[:3], name)
    }


## code ##

class DatefmtError(errors.ParserError):
    """Common base class of the date/time parsing errors of this module."""

class DateParserError(DatefmtError):
    """Raised when the date part cannot be interpreted."""

class TimeParserError(DatefmtError):
    """Raised when the time part cannot be interpreted."""

class DateFormatError(DatefmtError):
    """Raised when a date/time format string (DF) is empty or invalid."""

class DF(str):
    """Format descriptor of a user-supplied date/time.

    The string consists of indicator characters, one per unit that was
    specified by the user (e.g. 'DMY' or 'hm').
    """
    # indicator characters, ordered from the most to the least significant unit.
    _valid_chars = "YMDhmsn"
    # explicit mapping from unit to character
    year                = 'Y'
    month               = 'M'
    day                 = 'D'
    hour                = 'h'
    minute              = 'm'
    second              = 's'
    microsecond         = 'n'

    def valid(self):
        """Return the number of distinct known units in the format (zero if empty)."""
        if not self:
            return 0
        return len(set(self).intersection(self._valid_chars))

    def lsb(self):
        """Smallest unit specified."""
        if not self.valid():
            raise DateFormatError(
                'An empty date format string has no least significant position.', self)

        # scan from the least significant unit upwards; the first hit wins
        return next(unit for unit in reversed(self._valid_chars) if unit in self)

    def msb(self):
        """Highest unit specified."""
        if not self.valid():
            raise DateFormatError(
                'An empty date format string has no most significant position.', self)

        # scan from the most significant unit downwards; the first hit wins
        return next(unit for unit in self._valid_chars if unit in self)

    def is_time(self):
        """Return true if only a time (hour/minute/second/ms) was specified."""
        if not self.valid():
            return False
        return self.msb() not in 'YMD'

    def is_date(self):
        """Return true if only a date (year/month/day) was specified."""
        if not self.valid():
            return False
        return self.lsb() not in 'hmsn'

# priorities: formats are tried in list order, the first valid one wins.
# International precedence (day before month).
PRIORITIES_INT = {
    'p2': [DF(first + second) for first, second in (
        (DF.day, DF.month),     # DM
        (DF.month, DF.year),    # MY
        (DF.year, DF.month),    # YM
        (DF.month, DF.day),     # MD
        (DF.day, DF.year),      # DY
        (DF.year, DF.day),      # YD
        )],
    'p3': [DF(first + second + third) for first, second, third in (
        (DF.day, DF.month, DF.year),    # DMY
        (DF.year, DF.month, DF.day),    # YMD
        (DF.month, DF.day, DF.year),    # MDY
        (DF.year, DF.day, DF.month),    # YDM
        (DF.day, DF.year, DF.month),    # DYM
        (DF.month, DF.year, DF.day),    # MYD
        )],
    }

# US precedence (month before day); formats are tried in list order.
PRIORITIES_US = {
    'p2': [DF(first + second) for first, second in (
        (DF.month, DF.day),     # MD
        (DF.year, DF.month),    # YM
        (DF.day, DF.month),     # DM
        (DF.month, DF.year),    # MY
        (DF.day, DF.year),      # DY
        (DF.year, DF.day),      # YD
        )],
    'p3': [DF(first + second + third) for first, second, third in (
        (DF.month, DF.day, DF.year),    # MDY
        (DF.year, DF.day, DF.month),    # YDM
        (DF.day, DF.month, DF.year),    # DMY
        (DF.year, DF.month, DF.day),    # YMD
        (DF.day, DF.year, DF.month),    # DYM
        (DF.month, DF.year, DF.day),    # MYD
        )],
    }

def guess_date(tokens, priorities=None):
    """Guess the date from a tokenized string in an unknown format.

    *tokens* is a sequence of string tokens (numbers, literal months, 'of',
    delimiters). *priorities* is a dict with the keys 'p2' and 'p3' that lists
    the two- and three-part formats in order of preference; it defaults to
    PRIORITIES_INT.

    Returns a tuple of the guessed date (datetime.date) and the DF format that
    was matched. Raises a DateParserError if no valid date can be derived.

    The method uses the following clues to guess the meaning of each part:
    * 4-digits implies it's a year
    * 1-digit discards year (since it's more common to write 04 instead of 4
      as a shorthand to 2004)
    * Literal month
    * 'of' is preceded by day and succeeded by the month
    * Any of (st, nd, rd, th) on a number makes it a day
    * Number > 12 can't be a month
    * Number > 31 can't be a day
    * Date inexistence (e.g. 29.02.2018)
    * precedence DMY > YMD > MDY > YDM
    * precedence DM > MY > YM > MD
    """
    priorities = PRIORITIES_INT if priorities is None else priorities

    # We need to figure out which token corresponds to which component
    # (D, M, Y). Since this is ambiguous, guesswork is needed. We do so
    # by eliminating impossible options.

    # initially, all three components are viable
    guesses = [Struct(tok=tok.strip(), fmt=DF.year + DF.month + DF.day) for tok in tokens]

    # check indicators for specific formats
    for idx, guess in enumerate(guesses):
        tok = guess.tok

        if len(tok) == 1 and tok in '.,;':
            # delimiter tokens can be ignored
            guess.fmt = ''
        elif tok == 'of':
            # an 'of' token indicates a 'day of month' structure; it needs a
            # neighbour on both sides (idx-1 would otherwise wrap around to
            # the last token, idx+1 would run past the end).
            if idx == 0 or idx == len(guesses) - 1:
                raise DateParserError("'of' must be enclosed by a day and a month")
            guesses[idx-1].fmt = DF.day
            guesses[idx+1].fmt = DF.month
            guess.fmt = ''
        elif tok[-2:] in ('st', 'nd', 'rd', 'th') and tok[:-2].isdigit():
            # ordinal suffix on a number indicates a day. The digit check keeps
            # words that merely end in such a suffix (e.g. 'August') out.
            guess.fmt = DF.day
            guess.tok = tok[:-2]
        elif len(tok) == 4 and tok.isdigit():
            # four digits must be a year
            guess.fmt = DF.year
        elif tok in MONTH_LIT:
            # spelled out month is - tadaaa - a month
            guess.tok = str(MONTH_LIT[tok])
            guess.fmt = DF.month

    # remove jitter (of, delimiters)
    guesses = [itm for itm in guesses if len(itm.fmt) > 0]

    # eliminate impossible options
    for guess in guesses:
        tok = guess.tok

        if len(tok) == 1:
            # can't be a year
            guess.fmt = guess.fmt.replace(DF.year, '')
        if tok.isdigit():
            value = int(tok)
            if value > 12:
                # can't be a month
                guess.fmt = guess.fmt.replace(DF.month, '')
            if value > 31:
                # can't be a day
                guess.fmt = guess.fmt.replace(DF.day, '')

    # define helper function
    def create_date(year, month, day):
        """Return a date for the given components or None if that is not possible."""
        # check format
        if DF.year not in year.fmt or DF.month not in month.fmt or DF.day not in day.fmt:
            return None

        try:
            year_tok = str(year.tok)
            if len(year_tok) == 2:
                # ten years into the future is still the current century,
                # otherwise the previous one
                threshold = ddate.today().year + 10 - 2000
                year_tok = ('20' if int(year_tok) < threshold else '19') + year_tok
            # create date
            return ddate(year=int(year_tok), month=int(month.tok), day=int(day.tok))
        except ValueError:
            # non-numeric token or inexistent date
            return None

    # placeholders for unspecified tokens
    pyear = Struct(tok=ddate.today().year, fmt=DF.year)
    pday = Struct(tok=1, fmt=DF.day)
    pmon = Struct(tok=1, fmt=DF.month)

    if len(guesses) == 1: # one-part date (Y)
        date = create_date(guesses[0], pmon, pday)
        if date is None:
            raise DateParserError('Two-digit date format must contain the year')
        return date, DF(DF.year)

    elif len(guesses) == 2: # two-part date (DM, MY, YM, MD)
        fst, snd = guesses
        # check components
        if len(set(fst.fmt + snd.fmt)) < 2:
            raise DateParserError('Invalid two-digit date format')

        if len(fst.fmt) == 1 and len(snd.fmt) == 1: # fully determined
            parts = {
                DF.year: pyear,
                DF.month: pmon,
                DF.day: pday,
                }
            parts.update({
                fst.fmt: fst,
                snd.fmt: snd,
                })
            date = create_date(parts[DF.year], parts[DF.month], parts[DF.day])
            if date is None:
                # roles are clear but the values do not form a date (e.g. 31st of Feb)
                raise DateParserError('Invalid two-digit date')
            return date, DF(fst.fmt + snd.fmt)

        # walk through prioritized formats
        formats = {
            DF(DF.day + DF.month):  create_date(pyear, snd, fst), # DM
            DF(DF.month + DF.year): create_date(snd, fst, pday),  # MY
            DF(DF.year + DF.month): create_date(fst, snd, pday),  # YM
            DF(DF.month + DF.day):  create_date(pyear, fst, snd), # MD
            DF(DF.day + DF.year):   create_date(snd, pmon, fst),  # DY
            DF(DF.year + DF.day):   create_date(fst, pmon, snd),  # YD
        }

        for fmt in priorities['p2']:
            if formats.get(fmt, None) is not None:
                return formats[fmt], fmt

        raise DateParserError('Cannot guess roles of a two-digit date format')

    elif len(guesses) == 3: # three-part date (DMY, YMD, MDY, YDM)

        # eliminate options based on uniqueness of component assignment
        changed = True
        while changed:
            # resolved guesses: item has only one possible component option
            resolved = {itm.fmt for itm in guesses if len(itm.fmt) == 1}
            # single choice: component has only one possible position
            unique = {comp for comp, freq in
                      Counter(flatten([set(itm.fmt) for itm in guesses])).items()
                      if freq == 1}
            # assume no changes
            changed = False
            for itm in guesses:
                if unique & set(itm.fmt) and not set(itm.fmt).issubset(unique):
                    # itm is the only option for one component
                    itm.fmt = DF(''.join(unique & set(itm.fmt)))
                    changed = True
                elif resolved & set(itm.fmt) and not set(itm.fmt).issubset(resolved):
                    # itm contains options that are already taken by a different item
                    itm.fmt = itm.fmt.translate(str.maketrans('', '', ''.join(resolved)))
                    changed = True

        fst, snd, trd = guesses

        # check components
        if len(set(fst.fmt + snd.fmt + trd.fmt)) < 3:
            raise DateParserError('Invalid three-digit date format')

        if len(fst.fmt) == 1 and len(snd.fmt) == 1 and len(trd.fmt) == 1: # fully determined
            parts = {
                fst.fmt: fst,
                snd.fmt: snd,
                trd.fmt: trd,
                }
            date = create_date(parts[DF.year], parts[DF.month], parts[DF.day])
            if date is None:
                # roles are clear but the values do not form a date (e.g. 29.02.2018)
                raise DateParserError('Invalid three-digit date')
            return date, DF(fst.fmt + snd.fmt + trd.fmt)

        # walk through prioritized formats
        formats = {
            DF(DF.day + DF.month + DF.year): create_date(year=trd, month=snd, day=fst), # DMY
            DF(DF.year + DF.month + DF.day): create_date(year=fst, month=snd, day=trd), # YMD
            DF(DF.month + DF.day + DF.year): create_date(year=trd, month=fst, day=snd), # MDY
            DF(DF.year + DF.day + DF.month): create_date(year=fst, month=trd, day=snd), # YDM
            DF(DF.day + DF.year + DF.month): create_date(year=snd, month=trd, day=fst), # DYM
            DF(DF.month + DF.year + DF.day): create_date(year=snd, month=fst, day=trd), # MYD
        }

        for fmt in priorities['p3']:
            if formats.get(fmt, None) is not None:
                return formats[fmt], fmt

        raise DateParserError('Cannot guess the roles of a three-digit date format')

    raise DateParserError('Cannot parse the date format')

def guess_time(tokens):
    """Guess the time from a tokenized string in an unknown format.

    *tokens* is a sequence of string tokens (numbers, separators and
    optionally 'am' or 'pm'). Returns a tuple of the guessed time
    (datetime.time) and the DF format that was matched. Raises a
    TimeParserError if the tokens contain no number, too many numbers,
    or values out of range.

    * Always sorted from hi (hour) to low (sec)
    * 4 Terms -> hour, min, sec, fraction of a second
    * 3 Terms -> hour, min, sec
    * 2 Terms -> hour, min | min, sec
      * am or pm present -> hour, min
      * first term > 24 -> min, sec
      * otherwise -> hour, min
    * 1 Term -> hour
    * In am/pm notation, 12am is midnight (00:00) and 12pm is noon (12:00).
    """
    # remove separators
    tokens = [tok.lower() for tok in tokens if tok not in '.,:']
    # check if the am/pm format was used
    is_am = 'am' in tokens
    is_pm = 'pm' in tokens

    # remove non-numbers
    digits = [tok for tok in tokens if tok.isdigit()]
    if not digits:
        raise TimeParserError('No numeric time component found', tokens)

    # convert to int
    values = [int(tok) for tok in digits]

    def to_24h(hour):
        """Convert an hour in am/pm notation to the 24h clock (am takes precedence)."""
        if is_am:
            return 0 if hour == 12 else hour
        if is_pm:
            return 12 if hour == 12 else hour + 12
        return hour

    # guess format
    try:
        if len(values) == 4: # H:M:S.NS
            # The last term is a decimal fraction of a second. Scale it to
            # microseconds; digits beyond microsecond precision are dropped.
            values[-1] = int(digits[-1][:6].ljust(6, '0'))
            return dtime(*values), DF(DF.hour + DF.minute + DF.second + DF.microsecond)
        elif len(values) == 3: # H:M:S
            return dtime(*values), DF(DF.hour + DF.minute + DF.second)
        elif len(values) == 2: # H:M or M:S
            if is_am or is_pm: # am/pm notation was used
                return dtime(to_24h(values[0]), values[1]), DF(DF.hour + DF.minute)
            elif values[0] > 24: # min, sec
                return dtime(0, *values), DF(DF.minute + DF.second)
            else: # hour, min
                return dtime(*values), DF(DF.hour + DF.minute)
        elif len(values) == 1: # H
            return dtime(to_24h(values[0])), DF(DF.hour)

    except ValueError as err:
        # invalid value was supplied, e.g. hour=85
        raise TimeParserError('Invalid value', values) from err

    raise TimeParserError('Unknown time format', values)

def guess_datetime(exp):
    """Guess the components of a parsed, user-supplied date and/or time string.

    *exp* is supposed to be a pyparsing.ParseResults instance as returned by
    DATETIME.parseString(...), with the optional named results 'date' and
    'time'. Guessing might be necessary since dates like 10.11.12 are ambiguous.

    Returns a tuple of a datetime instance and the format of the specified
    parts. A missing time defaults to midnight, a missing date to the local
    date of timestamp zero. Raises a DateFormatError if *exp* holds neither part.
    """
    # For now I assumed unique separators (dot for date, colon for time, comma to separate the two)
    has_date = 'date' in exp
    has_time = 'time' in exp

    if has_date:
        day, dfmt = guess_date(exp.date)
        if not has_time: # date only
            return datetime(day.year, day.month, day.day), dfmt
        # both parts present
        clock, tfmt = guess_time(exp.time)
        return datetime.combine(day, clock), DF(dfmt+tfmt)

    if has_time: # time only
        clock, tfmt = guess_time(exp.time)
        return datetime.combine(ddate.fromtimestamp(0), clock), tfmt

    raise DateFormatError('Neither a date nor a time was found.')

def increment(date, fmt):
    """Increment the LSB of a datetime instance by one.

    *fmt* is the DF format that tells which units of *date* were specified.
    The smallest specified unit is advanced by one and all smaller units
    are reset to their lowest value. Raises a DateFormatError if *fmt* is
    empty or invalid.
    """
    if fmt == '' or not fmt.valid():
        raise DateFormatError('Invalid date format string', fmt)

    unit = fmt.lsb()
    if unit == fmt.microsecond: # 5.11.2012, 06:24:18.25 -> one microsecond later
        step = relativedelta(microseconds=1)
    elif unit == fmt.second: # 5.11.2012, 06:24:18 -> 5.11.2012, 06:24:19
        step = relativedelta(seconds=1, microsecond=0)
    elif unit == fmt.minute: # 5.11.2012, 06:24 -> 5.11.2012, 06:25
        step = relativedelta(minutes=1, second=0, microsecond=0)
    elif unit == fmt.hour: # 5.11.2012, 06am -> 5.11.2012, 07:00
        step = relativedelta(hours=1, minute=0, second=0, microsecond=0)
    elif unit == fmt.day: # 5.11.2012 -> 6.11.2012, 00:00
        step = relativedelta(days=1, hour=0, minute=0, second=0, microsecond=0)
    elif unit == fmt.month: # 11.2012 -> 1.12.2012, 00:00
        step = relativedelta(months=1, day=1, hour=0, minute=0, second=0, microsecond=0)
    else: # unit == fmt.year: # 2012 -> 1.1.2013, 00:00
        step = relativedelta(
                years=1, month=1, day=1, hour=0, minute=0, second=0, microsecond=0)

    return date + step

class DateTimeParser():
    """Callable parser that converts a date and/or time string into a datetime.

    The pyparsing grammar is stored in the DATE, TIME and DATETIME attributes
    once build_parser was called; calling the instance builds it on demand.
    """

    DATE = None
    TIME = None
    DATETIME = None

    def build_parser(self):
        """Build the DATE, TIME and DATETIME grammars and return the parser itself.

        DATE        := YMD | DMY | MDY | YDM
        YMD         := YEAR SEP MON SEP DAY
        DMY         := DAY SEP [of] MON SEP YEAR
        MDY         := MON SEP DAY SEP YEAR
        YDM         := YEAR SEP DAY [of] MON
        DM          := DAY SEP [of] MON
        YM          := YEAR SEP MON
        MY          := MON SEP YEAR
        MD          := MON SEP DAY
        DAY         := [D]D | [D]D st | [D]D nd | [D]D rd | [D]D th
        MON         := [M]M | [month]
        YEAR        := [YY]YY
        SEP         := . | , | [whitespace]
        {D,M,Y}     := [digit]
        """
        # FIXME: Allow more patterns (e.g. 2012, 10; April, 5th; April, 2020)
        date_sep = Literal('.') # FIXME: Allow '. - :'
        year = Word(nums, exact=2) ^ Word(nums, exact=4)
        month = Word(nums, min=1, max=2) ^ oneOf(list(MONTH_LIT.keys()))
        day = Combine(Word(nums, min=1, max=2) + Optional(oneOf('st nd rd th')))
        # three-part-date
        ymd = year + date_sep + month + date_sep + day
        dmy = day + (date_sep ^ 'of') + month + date_sep + year
        mdy = month + date_sep + day + date_sep + year
        ydm = year + date_sep + day + (date_sep ^ 'of') + month
        # two-part-date
        dm = day + (date_sep ^ 'of') + month
        ym = year + date_sep + month
        my = month + date_sep + year
        md = month + date_sep + day
        # one-part-date
        y = Word(nums, exact=4)
        # date parser; alternatives are tried in this order, first match wins
        self.DATE = Group(
            ymd | dmy | ydm | mdy | dm | ym | my | md | y
            ).setResultsName('date')

        # TIME        := HOUR SEP MIN [SEP SEC [. MS]] | HOUR SEP MIN | HOUR [SEP MIN] {am|pm}
        # HOUR        := [H]H
        # MIN         := [M]M
        # SEC         := [S]S
        # {H,M,S}     := [digit]
        # SEP         := : | . | ,
        time_sep = Literal(':') # FIXME: Allow '. : -'
        hms = Word(nums, min=1, max=2)
        fraction = Word(nums, min=1)
        # time parser; alternatives are tried in this order, first match wins
        self.TIME = Group(
            (hms + time_sep + hms + time_sep + hms + oneOf('. :') + fraction)
            | (hms + time_sep + hms + time_sep + hms)
            | (hms + Optional(time_sep + hms) + oneOf('am pm'))
            | (hms + time_sep + hms)
            ).setResultsName('time')

        # DATETIME    := DATE | TIME | DATE SEP TIME | TIME SEP DATE
        # SEP         := , [whitespace]
        self.DATETIME = Group(
            self.DATE
            ^ self.TIME
            ^ (self.DATE + Optional(',') + self.TIME)
            ^ (self.TIME + Optional(',') + self.DATE)
            ).setResultsName('datetime')
        return self

    def __call__(self, datestr):
        """Parse *datestr* and return the guessed datetime.

        Raises an errors.ParserError if *datestr* does not match the grammar.
        """
        # build the grammar on first use
        if self.DATETIME is None:
            self.build_parser()

        try:
            parsed = self.DATETIME.parseString(datestr, parseAll=True)
            # the format of the guess is not reported to the caller
            date, _ = guess_datetime(parsed[0])
            return date

        except ParseException as e:
            raise errors.ParserError('Cannot parse query', e)


"""Default DateTimeParser instance.

To produce a datetime, call

>>> parse_datetime(datestring)

Convenience shortcut for

>>> DateTimeParser()(datestring)

"""
# Shared parser instance with its grammar already built (build_parser returns
# the instance itself); call it with a date/time string.
parse_datetime = DateTimeParser().build_parser()

## EOF ##
