# HG changeset patch # User Sergey Poznyakoff # Date 1250092987 -10800 # Node ID 451cf730f520105821c95cf1dece30496f80d243 # Parent d2c9dd1850029f7f60604ece0b084e2a9716dc03 Optimize exclude: use hash tables for non-wildcard patterns. * lib/exclude.c: Include hash.h and mbuiter.h (struct exclude_pattern, exclude_segment): New data types. (struct exclude): Rewrite. (fnmatch_pattern_has_wildcards): New function. (new_exclude_segment, free_exclude_segment): New functions. (excluded_file_pattern_p, excluded_file_name_p): New functions. (excluded_file_name, add_exclude): Rewrite using new struct exclude. * lib/exclude.h (is_fnmatch_pattern): New prototype. * modules/exclude: Depend on hash and mbuiter. * modules/exclude-tests: New file. * tests/test-exclude.c: New file. * tests/test-exclude1.sh: New file. * tests/test-exclude2.sh: New file. * tests/test-exclude3.sh: New file. * tests/test-exclude4.sh: New file. * tests/test-exclude5.sh: New file. * tests/test-exclude6.sh: New file. * tests/test-exclude7.sh: New file. diff --git a/lib/exclude.c b/lib/exclude.c --- a/lib/exclude.c +++ b/lib/exclude.c @@ -1,7 +1,7 @@ /* exclude.c -- exclude file names Copyright (C) 1992, 1993, 1994, 1997, 1999, 2000, 2001, 2002, 2003, - 2004, 2005, 2006, 2007 Free Software Foundation, Inc. + 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,7 +16,10 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* Written by Paul Eggert */ +/* Written by Paul Eggert + and Sergey Poznyakoff . + Thanks to Phil Proudman + for improvement suggestions. */ #include @@ -30,6 +33,8 @@ #include #include "exclude.h" +#include "hash.h" +#include "mbuiter.h" #include "fnmatch.h" #include "xalloc.h" #include "verify.h" @@ -54,6 +59,14 @@ | FNM_CASEFOLD | FNM_EXTMATCH)) == 0); + +/* Exclusion patterns are grouped into a singly-linked list of + "exclusion segments". Each segment represents a set of patterns + that can be matches using the same algorithm. Non-wildcard + patterns are kept in hash tables, to speed up searches. Wildcard + patterns are stored as arrays of patterns. */ + + /* An exclude pattern-options pair. The options are fnmatch options ORed with EXCLUDE_* options. */ @@ -63,15 +76,63 @@ int options; }; -/* An exclude list, of pattern-options pairs. */ +/* An array of pattern-options pairs. */ -struct exclude +struct exclude_pattern { struct patopts *exclude; size_t exclude_alloc; size_t exclude_count; }; +enum exclude_type + { + exclude_hash, /* a hash table of excluded names */ + exclude_pattern /* an array of exclude patterns */ + }; + +struct exclude_segment + { + struct exclude_segment *next; /* next segment in list */ + enum exclude_type type; /* type of this segment */ + int options; /* common options for this segment */ + union + { + Hash_table *table; /* for type == exclude_hash */ + struct exclude_pattern pat; /* for type == exclude_pattern */ + } v; + }; + +/* The exclude structure keeps a singly-linked list of exclude segments */ +struct exclude + { + struct exclude_segment *head, *tail; + }; + +/* Return true if str has wildcard characters */ +bool +fnmatch_pattern_has_wildcards (const char *str, int options) +{ + char *cset = "\\?*[]"; + if (options & FNM_NOESCAPE) + cset++; + while (*str) + { + size_t n = strcspn (str, cset); + if (str[n] == 0) + break; + else if (str[n] == '\\') + { + str += n + 1; + if (*str) + str++; + } + else + return true; + } + return false; +} + /* Return a newly allocated and empty exclude list. */ struct exclude * @@ -80,12 +141,122 @@ return xzalloc (sizeof *new_exclude ()); } +/* Calculate the hash of string. */ +static size_t +string_hasher (void const *data, size_t n_buckets) +{ + char const *p = data; + return hash_string (p, n_buckets); +} + +/* Ditto, for case-insensitive hashes */ +static size_t +string_hasher_ci (void const *data, size_t n_buckets) +{ + char const *p = data; + mbui_iterator_t iter; + size_t value = 0; + + for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter)) + { + mbchar_t m = mbui_cur (iter); + wchar_t wc; + + if (m.wc_valid) + wc = towlower (m.wc); + else + wc = *m.ptr; + + value = (value * 31 + wc) % n_buckets; + } + + return value; +} + +/* compare two strings for equality */ +static bool +string_compare (void const *data1, void const *data2) +{ + char const *p1 = data1; + char const *p2 = data2; + return strcmp (p1, p2) == 0; +} + +/* compare two strings for equality, case-insensitive */ +static bool +string_compare_ci (void const *data1, void const *data2) +{ + char const *p1 = data1; + char const *p2 = data2; + return mbscasecmp (p1, p2) == 0; +} + +static void +string_free (void *data) +{ + free (data); +} + +/* Create new exclude segment of given TYPE and OPTIONS, and attach it + to the tail of list in EX */ +struct exclude_segment * +new_exclude_segment (struct exclude *ex, enum exclude_type type, int options) +{ + struct exclude_segment *sp = xzalloc (sizeof (struct exclude_segment)); + sp->type = type; + sp->options = options; + switch (type) + { + case exclude_pattern: + break; + + case exclude_hash: + sp->v.table = hash_initialize (0, NULL, + (options & FNM_CASEFOLD) ? + string_hasher_ci + : string_hasher, + (options & FNM_CASEFOLD) ? + string_compare_ci + : string_compare, + string_free); + break; + } + if (ex->tail) + ex->tail->next = sp; + else + ex->head = sp; + ex->tail = sp; + return sp; +} + +/* Free a single exclude segment */ +static void +free_exclude_segment (struct exclude_segment *seg) +{ + switch (seg->type) + { + case exclude_pattern: + free (seg->v.pat.exclude); + break; + + case exclude_hash: + hash_free (seg->v.table); + break; + } + free (seg); +} + /* Free the storage associated with an exclude list. */ - void free_exclude (struct exclude *ex) { - free (ex->exclude); + struct exclude_segment *seg; + for (seg = ex->head; seg; ) + { + struct exclude_segment *next = seg->next; + free_exclude_segment (seg); + seg = next; + } free (ex); } @@ -155,36 +326,113 @@ return matched; } +/* Return true if the exclude_pattern segment SEG excludes F. */ + +bool +excluded_file_pattern_p (struct exclude_segment const *seg, char const *f) +{ + size_t exclude_count = seg->v.pat.exclude_count; + struct patopts const *exclude = seg->v.pat.exclude; + size_t i; + bool excluded = !! (exclude[0].options & EXCLUDE_INCLUDE); + + /* Scan through the options, until they change excluded */ + for (i = 0; i < exclude_count; i++) + { + char const *pattern = exclude[i].pattern; + int options = exclude[i].options; + if (excluded != exclude_fnmatch (pattern, f, options)) + return !excluded; + } + return excluded; +} + +/* Return true if the exclude_hash segment SEG excludes F. + BUFFER is an auxiliary storage of the same length as F (with nul + terminator included) */ +bool +excluded_file_name_p (struct exclude_segment const *seg, char const *f, + char *buffer) +{ + int options = seg->options; + bool excluded = !! (options & EXCLUDE_INCLUDE); + Hash_table *table = seg->v.table; + + do + { + /* initialize the pattern */ + strcpy (buffer, f); + + while (1) + { + if (hash_lookup (table, buffer)) + return !excluded; + if (options & FNM_LEADING_DIR) + { + char *p = strrchr (buffer, '/'); + if (p) + { + *p = 0; + continue; + } + } + break; + } + + if (!(options & EXCLUDE_ANCHORED)) + { + f = strchr (f, '/'); + if (f) + f++; + } + else + break; + } + while (f); + return excluded; +} + /* Return true if EX excludes F. */ bool excluded_file_name (struct exclude const *ex, char const *f) { - size_t exclude_count = ex->exclude_count; + struct exclude_segment *seg; + bool excluded; + char *filename = NULL; - /* If no options are given, the default is to include. */ - if (exclude_count == 0) + /* If no patterns are given, the default is to include. */ + if (!ex->head) return false; - else + + /* Otherwise, the default is the opposite of the first option. */ + excluded = !! (ex->head->options & EXCLUDE_INCLUDE); + /* Scan through the segments, seeing whether they change status from + excluded to included or vice versa. */ + for (seg = ex->head; seg; seg = seg->next) { - struct patopts const *exclude = ex->exclude; - size_t i; - - /* Otherwise, the default is the opposite of the first option. */ - bool excluded = !! (exclude[0].options & EXCLUDE_INCLUDE); + bool rc; - /* Scan through the options, seeing whether they change F from - excluded to included or vice versa. */ - for (i = 0; i < exclude_count; i++) + switch (seg->type) { - char const *pattern = exclude[i].pattern; - int options = exclude[i].options; - if (excluded == !! (options & EXCLUDE_INCLUDE)) - excluded ^= exclude_fnmatch (pattern, f, options); + case exclude_pattern: + rc = excluded_file_pattern_p (seg, f); + break; + + case exclude_hash: + if (!filename) + filename = xmalloc (strlen (f) + 1); + rc = excluded_file_name_p (seg, f, filename); + break; } - - return excluded; + if (rc != excluded) + { + excluded = rc; + break; + } } + free (filename); + return excluded; } /* Append to EX the exclusion PATTERN with OPTIONS. */ @@ -192,15 +440,46 @@ void add_exclude (struct exclude *ex, char const *pattern, int options) { - struct patopts *patopts; + struct exclude_segment *seg; + + if ((options & EXCLUDE_WILDCARDS) + && fnmatch_pattern_has_wildcards (pattern, options)) + { + struct exclude_pattern *pat; + struct patopts *patopts; + + if (ex->tail && ex->tail->type == exclude_pattern + && ((ex->tail->options & EXCLUDE_INCLUDE) == + (options & EXCLUDE_INCLUDE))) + seg = ex->tail; + else + seg = new_exclude_segment (ex, exclude_pattern, options); - if (ex->exclude_count == ex->exclude_alloc) - ex->exclude = x2nrealloc (ex->exclude, &ex->exclude_alloc, - sizeof *ex->exclude); + pat = &seg->v.pat; + if (pat->exclude_count == pat->exclude_alloc) + pat->exclude = x2nrealloc (pat->exclude, &pat->exclude_alloc, + sizeof *pat->exclude); + patopts = &pat->exclude[pat->exclude_count++]; + patopts->pattern = pattern; + patopts->options = options; + } + else + { + char *str, *p; +#define EXCLUDE_HASH_FLAGS (EXCLUDE_INCLUDE|EXCLUDE_ANCHORED|\ + FNM_LEADING_DIR|FNM_CASEFOLD) + if (ex->tail && ex->tail->type == exclude_hash + && ((ex->tail->options & EXCLUDE_HASH_FLAGS) == + (options & EXCLUDE_HASH_FLAGS))) + seg = ex->tail; + else + seg = new_exclude_segment (ex, exclude_hash, options); - patopts = &ex->exclude[ex->exclude_count++]; - patopts->pattern = pattern; - patopts->options = options; + str = xstrdup (pattern); + p = hash_insert (seg->v.table, str); + if (p != str) + free (str); + } } /* Use ADD_FUNC to append to EX the patterns in FILE_NAME, each with diff --git a/lib/exclude.h b/lib/exclude.h --- a/lib/exclude.h +++ b/lib/exclude.h @@ -1,7 +1,7 @@ /* exclude.h -- declarations for excluding file names Copyright (C) 1992, 1993, 1994, 1997, 1999, 2001, 2002, 2003, 2005, - 2006 Free Software Foundation, Inc. + 2006, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,7 +16,8 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* Written by Paul Eggert */ +/* Written by Paul Eggert + and Sergey Poznyakoff */ /* Exclude options, which can be ORed with fnmatch options. */ @@ -33,6 +34,8 @@ struct exclude; +bool fnmatch_pattern_has_wildcards (const char *, int); + struct exclude *new_exclude (void); void free_exclude (struct exclude *); void add_exclude (struct exclude *, char const *, int); diff --git a/modules/exclude b/modules/exclude --- a/modules/exclude +++ b/modules/exclude @@ -8,7 +8,9 @@ Depends-on: fnmatch +hash mbscasecmp +mbuiter stdbool verify xalloc diff --git a/modules/exclude-tests b/modules/exclude-tests new file mode 100644 --- /dev/null +++ b/modules/exclude-tests @@ -0,0 +1,28 @@ +Files: +tests/test-exclude.c +tests/test-exclude1.sh +tests/test-exclude2.sh +tests/test-exclude3.sh +tests/test-exclude4.sh +tests/test-exclude5.sh +tests/test-exclude6.sh +tests/test-exclude7.sh + +Depends-on: +progname +error +argmatch + +Makefile.am: +TESTS += \ + test-exclude1.sh\ + test-exclude2.sh\ + test-exclude3.sh\ + test-exclude4.sh\ + test-exclude5.sh\ + test-exclude6.sh\ + test-exclude7.sh + +TESTS_ENVIRONMENT += EXEEXT='@EXEEXT@' +check_PROGRAMS += test-exclude +test_exclude_LDADD = $(LDADD) @LIBINTL@ diff --git a/tests/test-exclude.c b/tests/test-exclude.c new file mode 100644 --- /dev/null +++ b/tests/test-exclude.c @@ -0,0 +1,111 @@ +/* Test suite for exclude. + Copyright (C) 2009 Free Software Foundation, Inc. + This file is part of the GNUlib Library. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include +#include +#include +#include +#include +#include + +#include "exclude.h" +#include "progname.h" +#include "error.h" +#include "argmatch.h" + +#ifndef FNM_CASEFOLD +# define FNM_CASEFOLD 0 +#endif +#ifndef FNM_LEADING_DIR +# define FNM_LEADING_DIR 0 +#endif + +char const * const exclude_keywords[] = { + "noescape", + "pathname", + "period", + "leading_dir", + "casefold", + "anchored", + "include", + "wildcards", + NULL +}; + +int exclude_flags[] = { + FNM_NOESCAPE, + FNM_PATHNAME, + FNM_PERIOD, + FNM_LEADING_DIR, + FNM_CASEFOLD, + EXCLUDE_ANCHORED, + EXCLUDE_INCLUDE, + EXCLUDE_WILDCARDS +}; + +ARGMATCH_VERIFY (exclude_keywords, exclude_flags); + +int +main (int argc, char **argv) +{ + int exclude_options = 0; + struct exclude *exclude = new_exclude (); + + set_program_name (argv[0]); + + if (argc == 1) + error (1, 0, "usage: %s file -- words...", argv[0]); + + while (--argc) + { + char *opt = *++argv; + if (opt[0] == '-') + { + int neg = 0; + int flag; + char *s = opt + 1; + + if (opt[1] == '-' && opt[2] == 0) + { + argc--; + break; + } + if (strlen (s) > 3 && memcmp (s, "no-", 3) == 0) + { + neg = 1; + s += 3; + } + flag = XARGMATCH (opt, s, exclude_keywords, exclude_flags); + if (neg) + exclude_options &= ~flag; + else + exclude_options |= flag; + } + else if (add_exclude_file (add_exclude, exclude, opt, + exclude_options, '\n') != 0) + error (1, errno, "error loading %s", opt); + } + + for (; argc; --argc) + { + char *word = *++argv; + + printf ("%s: %d\n", word, excluded_file_name (exclude, word)); + } + return 0; +} diff --git a/tests/test-exclude1.sh b/tests/test-exclude1.sh new file mode 100755 --- /dev/null +++ b/tests/test-exclude1.sh @@ -0,0 +1,43 @@ +#! /bin/sh +# Test suite for exclude. +# Copyright (C) 2009 Free Software Foundation, Inc. +# This file is part of the GNUlib Library. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +# Test literal matches + +cat > $LIST < $TMP <. + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +cat > $LIST < $TMP <. + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +# Test include + +cat > $LIST < $TMP <. + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +# Test wildcard matching + +cat > $LIST < $TMP <. + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +# Test FNM_LEADING_DIR + +cat > $LIST < $TMP <. + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +# Test anchored + +cat > $LIST < $TMP <. + +TMP=excltmp.$$ +LIST=flist.$$ +ERR=0 + +# Test exclude precedence + +cat > $LIST < $TMP <$TMP.1 +./test-exclude$EXEEXT -include $LIST -no-include $LIST -- bar >>$TMP.1 + +diff -c $TMP $TMP.1 || ERR=1 + +rm -f $TMP $TMP.1 $LIST +exit $ERR