/*
   pattern.c - Wildcard pattern matching.

   Jason Hood, 8 November, 2002.

   Initially based on regexpr.c by Vesa Kolhinen.

   v1.01 - 6 & 7 April, 2003:
     add literal groups to the meta arrays;
     recognise the empty trailing '*' subpattern;
     fixed bug in DOS make_appr_mask() incorrectly removing trailing ".*".
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "letters.h"
#include "lfn.h"
#include "pattern.h"

#ifndef _WIN32
#include <dir.h>
#endif


// Subpatterns captured by the most recent match. matchstr() appends one
// entry (start pointer and length) for each wildcard, set and run of
// literal text it matches; efindfirst() starts the count at 1 and stores
// the whole file name in entry 0.
substr subpat[MAX_SUBPATS];
int    subpats;
// Subpattern numbers grouped by kind, filled in by compile_pattern():
// row 0 is '*', 1 is '?', 2 is '#', 3 is '[set]' and 4 is literal text.
// metas[k] is the number of entries used in meta[k].
int    meta[5][MAX_METAS];
int    metas[5];

// Allow 15 sets: 0 is the NUL terminator; fifth bit indicates optional.
// A compiled pattern refers to a set by storing its one-based number as a
// character, so numbers 1..15 (or 17..31 with OPT_SET added) all stay
// below ' '. Each set is a 32-byte bitmap with one bit per character.
// `sets` counts the bitmaps compiled so far; compile_set() ORs OPT_SET
// into it temporarily to tell compile_pattern() the set is optional.
#define MAX_SETS 15
#define OPT_SET  16
char class[MAX_SETS][32];
int  sets;

// casemap is rebuilt by every compile_pattern() call: it maps a character
// to upper case when `caseless` is non-zero, otherwise to itself.
char casemap[256];
int  caseless = 1;


/*
   "Compile" a set into bits. There are 256 characters, which divides into
   32 bytes of 8 bits. If a character is in the set, its corresponding bit
   is set. A negated set is compiled as normal, then the bits are inverted
   at the end.

   s points to the set to compile (past the opening '[');

   Returns NULL if the set could not be compiled (no closing ']');
   otherwise a pointer to the end of the set (the closing ']').
*/
char* compile_set( char* s )
{
  unsigned char* p = (unsigned char*)s;
  int	c, end;
  unsigned char  u;
  char* set = class[sets++];
  int	negate = 0;

  c = *p;
  if (c == '!' || c == '^')
  {
    negate = 1;
    c = *++p;
  }

  // Special case for [?] to optionally match any character.
  if (c == '?' && p[1] == ']')
  {
    memset( set, 0xff, 32 );
    sets |= OPT_SET;
    ++p;
    negate = 0;
  }
  else
  do
  {
    if (c == '\0')
      return NULL;

    if (c == '$')
    {
      end = c = *++p;
      if (c == '\0')
	return NULL;
    }
    else if (p[1] == '-' && p[2] && p[2] != ']')
    {
      end = *(p += 2);
      if (end == '$')
      {
	end = *++p;
	if (end == '\0')
	  return NULL;
      }
      if (end < c)
      {
	int t = c;
	c = end;
	end = t;
      }
    }
    else if (c == '#')
    {
      c   = '0';
      end = '9';
    }
    else if (c == '?')
    {
      sets |= OPT_SET;
      end = 0;
    }
    else
    {
      end = c;
    }

    for (; c <= end; ++c)
    {
      if (caseless)
      {
	u = upper( c );
	set[u >> 3] |= 1 << (u & 7);
	u = lower( c );
      }
      else
      {
	u = c;
      }
      set[u >> 3] |= 1 << (u & 7);
    }
    c = *++p;
  }
  while (c != ']');

  if (negate)
    for (c = 0; c < 32; ++c)
      set[c] ^= 0xff;

  return (char*)p;
}


/*
   Verify and compile a pattern.

   Compiling a pattern removes redundant sequences (such as multiple colons),
   translates sets into control characters and '#' into '|' (which are
   illegal in filenames) and removes the '$' escape character.

   mask:    the pattern as typed;
   pattern: receives the compiled pattern (never longer than mask).

   As a side effect the case conversion map is rebuilt and the subpattern
   number of every wildcard, set and run of literal text is appended to
   the meta arrays. metas[] and sets are not reset here.
   NOTE(review): presumably the caller clears them before each new
   pattern - confirm.

   Returns the number of subpatterns (including the implicit $0), or 0 for
   an invalid pattern (invalid range, too many ranges, too many
   subpatterns, too many wildcards).
*/
int compile_pattern( char* mask, char* pattern )
{
  char* const first = pattern;  // start of the output, bounds the trimming
  // Unsigned, because c doubles as an index into the meta arrays: with a
  // signed char a byte >= 0x80 would pass the "c < 4" test as a negative
  // index.
  unsigned char c;
  int  subs;
  int  extensions = 0;
  int  text = 1;                // next literal character starts a new run

  // Build the case conversion map.
  for (subs = 0; subs < 256; ++subs)
    casemap[subs] = (caseless) ? upper( subs ) : subs;

  subs = 1;                     // $0 is implicit
  while (*mask == ':')          // Ignore useless leading subpatterns
    ++mask;
  while (*mask)
  {
    c = *pattern++ = *mask++;
    // Replace a wildcard by its row in the meta arrays.
    switch (c)
    {
      case '*': c = 0; break;
      case '?': c = 1; break;
      case '#': c = 2; break;
      case '[': c = 3; break;
    }
    if (c < 4)
    {
      if (metas[c] == MAX_METAS)
        return 0;
      meta[c][metas[c]++] = subs++;
      text = 1;
      if (c == 0) // '*'
      {
        // Keep a second star: "**" is the non-greedy form.
        if (*mask == '*')
        {
          *pattern++ = '*';
          ++mask;
        }
        // Collapse multiple stars and subpatterns.
        while (*mask == '*' || *mask == ':')
          ++mask;
      }
      else if (c == 2) // '#'
      {
        pattern[-1] = '|';
      }
      else if (c == 3) // '['
      {
        if (sets == MAX_SETS || (mask = compile_set( mask )) == NULL)
          return 0;
        // The set is stored as its one-based number, plus OPT_SET when
        // compile_set() flagged it as optional.
        pattern[-1] = sets;
        sets &= ~OPT_SET;
        ++mask;                 // step over the closing ']'
      }
    }
    else if (c == ':')
    {
      // Ignore subpatterns before or after a wildcard. Leading colons
      // were skipped, so pattern[-2] is always inside the output.
      if (!*mask || *mask == ':' || *mask == '*' ||
          *mask == '?' || *mask == '#' || *mask == '[' ||
          pattern[-2] == '*' || pattern[-2] == '?' || pattern[-2] == '|' ||
          (unsigned char)pattern[-2] < ' ')
      {
        --pattern;
      }
      else
      {
        text = 1;
      }
    }
    else
    {
      if (text)
      {
        text = 0;
        if (metas[4] == MAX_METAS)
          return 0;
        meta[4][metas[4]++] = subs++;
      }
      // '$' escapes the next character; a trailing '$' stands for itself.
      if (c == '$' && *mask)
      {
        c = pattern[-1] = *mask++;
      }
      if (c == '.')
        ++extensions;
    }
  }

  // Remove extraneous trailing dots, without running off the front of
  // the output when the pattern consists of nothing but dots.
  if (extensions > 1)
    while (pattern > first && pattern[-1] == '.')
      --pattern;

  *pattern = '\0';

  if (subs > MAX_SUBPATS)
    return 0;

  return subs;
}


/*
   Match string s against pattern p. A '*' will match zero or more characters,
   '?' will match exactly one character, '[set]' will match one character from
   the set of characters, '[!set]' will match one character not in the set of
   characters, '#' will match a digit, ':' will start a new subpattern.
   A single '*' takes as much as it can; "**" takes as little as it can.

   A set is given by characters inside square brackets: [abc] will match
   any one of 'a', 'b' or 'c'. If the set starts with '!' or '^', it is
   negated and will match any character NOT in the set. A range of characters
   can be specified by using a '-': [0-9] will match any digit. To match a
   literal '-' include it as the first or last character of the set; to match
   a literal ']' include it first. '-' and ']' can also be escaped by
   preceding with '$'; this is the only way ']' can be seen as the end of a
   range. '#' is shorthand for "0-9": [!#] will match anything not a digit.
   If the set contains '?' matching the character is optional.

   The pattern should have been compiled prior to calling this routine, so
   here '#' appears as '|' and a set as a character below ' '. Characters
   of s are passed through casemap before being compared with literal
   pattern characters.
   NOTE(review): for caseless matching this assumes the literals in p are
   already in casemap's form; that is not done in compile_pattern - confirm
   where.

   Every wildcard, set and run of literal text matched adds an entry to
   subpat[], starting at the current value of subpats.

   Returns 1 if s matches; otherwise 0, with subpats restored to the value
   it had on entry.
*/
int matchstr( char* s, char* p )
{
  int start = subpats;          // restored if the match fails
  int star;
  int text = 1;                 // next literal character starts a new run

  while (*p)
  {
    if (*p == '*')
    {
      star = subpats++;
      subpat[star].str = s;
      if (*++p && (*p != '*' || p[1]))
      {
        // More pattern follows the star, and none of it can match at the
        // end of the string. With nothing left in s the searches below
        // would step outside the text (the greedy one before the star's
        // start, the other past the terminator). This state is reached
        // when an optional set has just consumed the last character.
        if (!*s)
          goto no_match;

        if (*p != '*')  // Greedy (maximized) match
        {
          // Try the rest of the pattern from the last character backwards.
          s += strlen( s );
          if (*p == '?' || *p == '|' || (unsigned char)*p < ' ')
          {
            while (!matchstr( --s, p ))
              if (s == subpat[star].str)
                goto no_match;
          }
          else
          {
            // A literal follows: only recurse where it actually occurs.
            for (;;)
            {
              if (casemap[(unsigned char)*--s] == *p)
                if (matchstr( s, p ))
                  break;
              if (s == subpat[star].str)
                goto no_match;
            }
          }
        }
        else            // Non-greedy (minimized) match
        {
          // Try the rest of the pattern from the current position forwards.
          ++p;
          if (*p == '?' || *p == '|' || (unsigned char)*p < ' ')
          {
            while (!matchstr( s, p ))
              if (!*++s)
                goto no_match;
          }
          else
          {
            for (;;)
            {
              if (casemap[(unsigned char)*s] == *p)
                if (matchstr( s, p ))
                  break;
              if (!*++s)
                goto no_match;
            }
          }
        }
        subpat[star].len = (int)(s - subpat[star].str);
      }
      else
      {
        // A trailing "*" or "**" takes whatever is left.
        subpat[star].len = strlen( s );
      }
      // The recursive call has matched the remainder of the pattern.
      return 1;
    }

    // Everything other than '*' needs a character to look at.
    if (!*s)
      goto no_match;

    if (*p == '?')
    {
      subpat[subpats].str = s;
      subpat[subpats++].len = 1;
      text = 1;
    }
    else if (*p == '|')
    {
      if (*s > '9' || *s < '0')
        goto no_match;
      subpat[subpats].str = s;
      subpat[subpats++].len = 1;
      text = 1;
    }
    // Compared as unsigned so that a literal >= 0x80 is not mistaken for a
    // set (and turned into a negative index) where char is signed.
    else if ((unsigned char)*p < ' ')
    {
      unsigned char c = *s;
      int set = *p - 1;         // zero-based index, OPT_SET bit preserved
      if (!(class[set & ~OPT_SET][c >> 3] & (1 << (c & 7))))
      {
        if (set & OPT_SET)
        {
          // Optional set matching nothing: record an empty subpattern and
          // step back so the increment below leaves s where it is.
          subpat[subpats].str = "";
          subpat[subpats++].len = 0;
          --s;
          text = 1;
        }
        else
        {
          goto no_match;
        }
      }
      else
      {
        subpat[subpats].str = s;
        subpat[subpats++].len = 1;
        text = 1;
        if (set & OPT_SET)
        {
          // It matched, but since it's optional, maybe it shouldn't have.
          if (matchstr( s+1, p+1 ))
            return 1;
          // The rest did not match that way: retry with the set empty.
          subpat[subpats-1].str = "";
          subpat[subpats-1].len = 0;
          --s;
        }
      }
    }
    else if (*p == ':')
    {
      // Subpattern separator: consumes nothing, ends the literal run.
      text = 1;
      --s;
    }
    else if (casemap[(unsigned char)*s] != *p)
    {
      goto no_match;
    }
    else
    {
      if (text)
      {
        subpat[subpats].str = s;
        subpat[subpats++].len = 0;
        text = 0;
      }
      ++subpat[subpats-1].len;
    }

    ++p;
    if (!*++s)
    {
      // The string is used up: all that may remain is a trailing '*' or
      // "**", which becomes an empty subpattern.
      if (*p)
      {
        do
          if (*p != '*')
            goto no_match;
        while (*++p);
        subpat[subpats].str = s;        // empty, at the end of the string
        subpat[subpats++].len = 0;
      }
    }
  }

  // The pattern is used up; the string must be too.
  if (*s)
  {
  no_match:
    subpats = start;
    return 0;
  }

  return 1;
}


/*
   Translate a pattern into the equivalent DOS mask.
   The pattern should have already been compiled.

   DOS does not allow anything after a '*' (except the extension).
   The LFN search mask only needs to translate the set into a '?',
   since '*' is allowed anywhere. '?' will also match the dot, so
   let's provide that behaviour in DOS. An optional set has to be
   translated to a '*'.

   mask:        the compiled pattern; a trailing '.' is removed from it;
   approx_mask: receives the search mask. No bounds checking is done, so
                the buffer must be large enough for the result.
*/
void make_appr_mask( char* mask, char* approx_mask )
{
  char* const pattern = mask;   // start of the pattern, for the final trim
  size_t len;

  if (use_lfn)
  {
    while (*mask)
    {
      if (*mask == '|' || *mask <= MAX_SETS)    // digit or plain set
        *approx_mask++ = '?';
      else if (*mask < ' ')                     // optional set
        *approx_mask++ = '*';
      else if (*mask != ':')                    // separators are dropped
        *approx_mask++ = *mask;
      ++mask;
    }
  }
  else
  {
    char* dot = strchr( mask, '.' );
    while (*mask)
    {
      if (*mask == '*' || (*mask < ' ' && (*mask & OPT_SET)))
      {
        *approx_mask++ = '*';
        if (dot)
        {
          // A star in the extension ends the mask.
          if (mask > dot)
            break;
          // A star in the name: skip the rest of the name and carry on
          // after the dot.
          *approx_mask++ = '.';
          mask = dot;
        }
        else
        {
          *approx_mask++ = '.';
          *approx_mask++ = '*';
          break;
        }
      }
      else if (*mask == '?' || *mask < ' ')
      {
        if (dot)
        {
          *approx_mask++ = '?';
        }
        else
        {
          // Without a dot in the pattern the '?' could be matching the
          // dot itself, so everything from here on has to be accepted.
          *approx_mask++ = '*';
          *approx_mask++ = '.';
          *approx_mask++ = '*';
          break;
        }
      }
      else if (*mask == '|')
      {
        *approx_mask++ = '?';
      }
      else if (*mask == ':')
      {
        // do nothing
      }
      else
      {
        *approx_mask++ = *mask;
      }
      ++mask;
    }
  }
  *approx_mask = '\0';

  // DOS will find all the files without extensions, but the filename
  // does not include the dot, so remove the dot from the pattern.
  // The last character is located from the start of the pattern: the
  // loops above may stop early, and the pattern may be empty, so stepping
  // back from the scan position could look at the wrong byte or at the
  // byte in front of the buffer.
  len = strlen( pattern );
  if (len > 0 && pattern[len - 1] == '.')
    pattern[len - 1] = '\0';
}


/*
   Find the first or subsequent files based on a pattern and attributes.

   pattern: the compiled pattern
   mask:    the equivalent DOS pattern; pass an empty string to continue
            the search already in progress in ffblk
   ffblk:   the search block, holding the name found on success
   attrib:  the attributes to search for (only used to start a search)

   When a search for directories starts with the "." and ".." entries,
   they are skipped.

   Returns 0 when a matching file was found, in which case subpat[0]
   describes the whole name and the remaining entries hold the
   subpatterns; returns -1 when there are no more files.
*/
int efindfirst( char* pattern, char* mask, struct my_ffblk* ffblk, int attrib )
{
  int done;

  if (*mask)
  {
    done = my_findfirst( mask, ffblk, attrib );

    // Drop the "." and ".." entries leading a directory search. The names
    // are tested one byte at a time (no cast to a wider type, which would
    // depend on alignment and byte order), and the entry after "." is
    // only discarded if it really is "..", so a genuine match is never
    // thrown away and a failed search is not continued.
    if (!done && (attrib & FA_DIREC) && ffblk->name[0] == '.')
    {
      if (ffblk->name[1] == '\0')               // "." matched first
      {
        done = my_findnext( ffblk );
        if (!done && ffblk->name[0] == '.' &&
            ffblk->name[1] == '.' && ffblk->name[2] == '\0')
        {
          done = my_findnext( ffblk );          // Start matching from here
        }
      }
      else if (ffblk->name[1] == '.' && ffblk->name[2] == '\0')
      {
        done = my_findnext( ffblk );            // ".." matched first
      }
    }
  }
  else
  {
    done = my_findnext( ffblk );
  }

  // The search mask is only an approximation: test every name it finds
  // against the real pattern.
  while (!done)
  {
    subpats = 1;                // entry 0 is kept for the whole name
    if (matchstr( ffblk->name, pattern ))
    {
      subpat[0].str = ffblk->name;
      subpat[0].len = strlen( ffblk->name );
      return 0;
    }
    done = my_findnext( ffblk );
  }

  return -1;
}
