upstream/mercurial-mirror Files · mercurial/pathencode.c

minirst: CGI escape strings prior to embedding it in the HTML

Matt Mackall - - Load All Authors

File last commit:

r18452:8bd338c7 default


                r18750:c9d923f5

default

Download file

             pathencode.c
        
                    760 lines
            
             | 16.3 KiB
            
                | text/x-c
            
             |
                CLexer
            
             / mercurial / pathencode.c
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Adrian Buehlmann
    
pathencode: new C module with fast encodedir() function...

              r17606
            
      /*

       pathencode.c - efficient path name encoding

       Copyright 2012 Facebook

       This software may be used and distributed according to the terms of

       the GNU General Public License, incorporated herein by reference.

      */

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      /*

       * An implementation of the name encoding scheme used by the fncache

       * store.  The common case is of a path < 120 bytes long, which is

       * handled either in a single pass with no allocations or two passes

       * with a single allocation.  For longer paths, multiple passes are

       * required.

       */

        Bryan O'Sullivan
    
store: implement lowerencode in C

              r18430
            
      #define PY_SSIZE_T_CLEAN

        Adrian Buehlmann
    
pathencode: new C module with fast encodedir() function...

              r17606
            
      #include <Python.h>

      #include <assert.h>

      #include <ctype.h>

      #include <stdlib.h>

      #include <string.h>

      #include "util.h"

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      /* state machine for the fast path */

      enum path_state {

      	START,   /* first byte of a path component */

      	A,       /* "AUX" */

      	AU,

      	THIRD,   /* third of a 3-byte sequence, e.g. "AUX", "NUL" */

      	C,       /* "CON" or "COMn" */

      	CO,

      	COMLPT,  /* "COM" or "LPT" */

      	COMLPTn,

      	L,

      	LP,

      	N,

      	NU,

      	P,       /* "PRN" */

      	PR,

      	LDOT,    /* leading '.' */

      	DOT,     /* '.' in a non-leading position */

      	H,       /* ".h" */

      	HGDI,    /* ".hg", ".d", or ".i" */

      	SPACE,

      	DEFAULT, /* byte of a path component after the first */

      };

        Adrian Buehlmann
    
pathencode: new C module with fast encodedir() function...

              r17606
            
      /* state machine for dir-encoding */

      enum dir_state {

      	DDOT,

      	DH,

      	DHGDI,

      	DDEFAULT,

      };

        André Sintzoff
    
pathencode: change isset name to avoid name collision...

              r17699
            
      static inline int inset(const uint32_t bitset[], char c)

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      {

      	return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));

      }

        Adrian Buehlmann
    
pathencode: new C module with fast encodedir() function...

              r17606
            
      static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,

                                  char c)

      {

      	if (dest) {

      		assert(*destlen < destsize);

      		dest[*destlen] = c;

      	}

      	(*destlen)++;

      }

      static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,

                                 const void *src, Py_ssize_t len)

      {

      	if (dest) {

      		assert(*destlen + len < destsize);

      		memcpy((void *)&dest[*destlen], src, len);

      	}

      	*destlen += len;

      }

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,

      			     uint8_t c)

      {

      	static const char hexdigit[] = "0123456789abcdef";

      	charcopy(dest, destlen, destsize, hexdigit[c >> 4]);

      	charcopy(dest, destlen, destsize, hexdigit[c & 15]);

      }

      /* 3-byte escape: tilde followed by two hex digits */

      static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,

      			   char c)

      {

      	charcopy(dest, destlen, destsize, '~');

      	hexencode(dest, destlen, destsize, c);

      }

        Adrian Buehlmann
    
pathencode: new C module with fast encodedir() function...

              r17606
            
      static Py_ssize_t _encodedir(char *dest, size_t destsize,

                                   const char *src, Py_ssize_t len)

      {

      	enum dir_state state = DDEFAULT;

      	Py_ssize_t i = 0, destlen = 0;

      	while (i < len) {

      		switch (state) {

      		case DDOT:

      			switch (src[i]) {

      			case 'd':

      			case 'i':

      				state = DHGDI;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'h':

      				state = DH;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			default:

      				state = DDEFAULT;

      				break;

      			}

      			break;

      		case DH:

      			if (src[i] == 'g') {

      				state = DHGDI;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DDEFAULT;

      			break;

      		case DHGDI:

      			if (src[i] == '/') {

      				memcopy(dest, &destlen, destsize, ".hg", 3);

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			state = DDEFAULT;

      			break;

      		case DDEFAULT:

      			if (src[i] == '.')

      				state = DDOT;

      			charcopy(dest, &destlen, destsize, src[i++]);

      			break;

      		}

      	}

      	return destlen;

      }

      PyObject *encodedir(PyObject *self, PyObject *args)

      {

      	Py_ssize_t len, newlen;

      	PyObject *pathobj, *newobj;

      	char *path;

      	if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))

      		return NULL;

      	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {

      		PyErr_SetString(PyExc_TypeError, "expected a string");

      		return NULL;

      	}

      	newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;

      	if (newlen == len + 1) {

      		Py_INCREF(pathobj);

      		return pathobj;

      	}

      	newobj = PyString_FromStringAndSize(NULL, newlen);

      	if (newobj) {

      		PyString_GET_SIZE(newobj)--;

      		_encodedir(PyString_AS_STRING(newobj), newlen, path,

      			   len + 1);

      	}

      	return newobj;

      }

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],

      			  char *dest, Py_ssize_t destlen, size_t destsize,

      			  const char *src, Py_ssize_t len,

      			  int encodedir)

      {

      	enum path_state state = START;

      	Py_ssize_t i = 0;

      	/*

      	 * Python strings end with a zero byte, which we use as a

      	 * terminal token as they are not valid inside path names.

      	 */

      	while (i < len) {

      		switch (state) {

      		case START:

      			switch (src[i]) {

      			case '/':

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case '.':

      				state = LDOT;

      				escape3(dest, &destlen, destsize, src[i++]);

      				break;

      			case ' ':

      				state = DEFAULT;

      				escape3(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'a':

      				state = A;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'c':

      				state = C;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'l':

      				state = L;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'n':

      				state = N;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'p':

      				state = P;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			default:

      				state = DEFAULT;

      				break;

      			}

      			break;

      		case A:

      			if (src[i] == 'u') {

      				state = AU;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case AU:

      			if (src[i] == 'x') {

      				state = THIRD;

      				i++;

      			}

      			else state = DEFAULT;

      			break;

      		case THIRD:

      			state = DEFAULT;

      			switch (src[i]) {

      			case '.':

      			case '/':

      			case '\0':

      				escape3(dest, &destlen, destsize, src[i - 1]);

      				break;

      			default:

      				i--;

      				break;

      			}

      			break;

      		case C:

      			if (src[i] == 'o') {

      				state = CO;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case CO:

      			if (src[i] == 'm') {

      				state = COMLPT;

      				i++;

      			}

      			else if (src[i] == 'n') {

      				state = THIRD;

      				i++;

      			}

      			else state = DEFAULT;

      			break;

      		case COMLPT:

      			switch (src[i]) {

      			case '1': case '2': case '3': case '4': case '5':

      			case '6': case '7': case '8': case '9':

      				state = COMLPTn;

      				i++;

      				break;

      			default:

      				state = DEFAULT;

      				charcopy(dest, &destlen, destsize, src[i - 1]);

      				break;

      			}

      			break;

      		case COMLPTn:

      			state = DEFAULT;

      			switch (src[i]) {

      			case '.':

      			case '/':

      			case '\0':

      				escape3(dest, &destlen, destsize, src[i - 2]);

      				charcopy(dest, &destlen, destsize, src[i - 1]);

      				break;

      			default:

      				memcopy(dest, &destlen, destsize,

      					&src[i - 2], 2);

      				break;

      			}

      			break;

      		case L:

      			if (src[i] == 'p') {

      				state = LP;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case LP:

      			if (src[i] == 't') {

      				state = COMLPT;

      				i++;

      			}

      			else state = DEFAULT;

      			break;

      		case N:

      			if (src[i] == 'u') {

      				state = NU;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case NU:

      			if (src[i] == 'l') {

      				state = THIRD;

      				i++;

      			}

      			else state = DEFAULT;

      			break;

      		case P:

      			if (src[i] == 'r') {

      				state = PR;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case PR:

      			if (src[i] == 'n') {

      				state = THIRD;

      				i++;

      			}

      			else state = DEFAULT;

      			break;

      		case LDOT:

      			switch (src[i]) {

      			case 'd':

      			case 'i':

      				state = HGDI;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'h':

      				state = H;

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			default:

      				state = DEFAULT;

      				break;

      			}

      			break;

      		case DOT:

      			switch (src[i]) {

      			case '/':

      			case '\0':

      				state = START;

      				memcopy(dest, &destlen, destsize, "~2e", 3);

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'd':

      			case 'i':

      				state = HGDI;

      				charcopy(dest, &destlen, destsize, '.');

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			case 'h':

      				state = H;

      				memcopy(dest, &destlen, destsize, ".h", 2);

      				i++;

      				break;

      			default:

      				state = DEFAULT;

      				charcopy(dest, &destlen, destsize, '.');

      				break;

      			}

      			break;

      		case H:

      			if (src[i] == 'g') {

      				state = HGDI;

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case HGDI:

      			if (src[i] == '/') {

      				state = START;

      				if (encodedir)

      					memcopy(dest, &destlen, destsize, ".hg",

      						3);

      				charcopy(dest, &destlen, destsize, src[i++]);

      			}

      			else state = DEFAULT;

      			break;

      		case SPACE:

      			switch (src[i]) {

      			case '/':

      			case '\0':

      				state = START;

      				memcopy(dest, &destlen, destsize, "~20", 3);

      				charcopy(dest, &destlen, destsize, src[i++]);

      				break;

      			default:

      				state = DEFAULT;

      				charcopy(dest, &destlen, destsize, ' ');

      				break;

      			}

      			break;

      		case DEFAULT:

        André Sintzoff
    
pathencode: change isset name to avoid name collision...

              r17699
            
      			while (inset(onebyte, src[i])) {

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      				charcopy(dest, &destlen, destsize, src[i++]);

      				if (i == len)

      					goto done;

      			}

      			switch (src[i]) {

      			case '.':

      				state = DOT;

      				i++;

      				break;

      			case ' ':

      				state = SPACE;

      				i++;

      				break;

      			case '/':

      				state = START;

      				charcopy(dest, &destlen, destsize, '/');

      				i++;

      				break;

      			default:

        André Sintzoff
    
pathencode: change isset name to avoid name collision...

              r17699
            
      				if (inset(onebyte, src[i])) {

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      					do {

      						charcopy(dest, &destlen,

      							 destsize, src[i++]);

      					} while (i < len &&

        André Sintzoff
    
pathencode: change isset name to avoid name collision...

              r17699
            
      						 inset(onebyte, src[i]));

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      				}

        André Sintzoff
    
pathencode: change isset name to avoid name collision...

              r17699
            
      				else if (inset(twobytes, src[i])) {

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      					char c = src[i++];

      					charcopy(dest, &destlen, destsize, '_');

      					charcopy(dest, &destlen, destsize,

      						 c == '_' ? '_' : c + 32);

      				}

      				else

      					escape3(dest, &destlen, destsize,

      						src[i++]);

      				break;

      			}

      			break;

      		}

      	}

      done:

      	return destlen;

      }

      static Py_ssize_t basicencode(char *dest, size_t destsize,

      			      const char *src, Py_ssize_t len)

      {

      	static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };

      	static const uint32_t onebyte[8] = {

      		1, 0x2bff3bfa, 0x68000001, 0x2fffffff,

      	};

      	Py_ssize_t destlen = 0;

      	return _encode(twobytes, onebyte, dest, destlen, destsize,

        Adrian Buehlmann
    
pathencode: simplify basicencode

              r17691
            
      		       src, len, 1);

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      }

      static const Py_ssize_t maxstorepathlen = 120;

        Bryan O'Sullivan
    
store: implement lowerencode in C

              r18430
            
      static Py_ssize_t _lowerencode(char *dest, size_t destsize,

      			       const char *src, Py_ssize_t len)

      {

      	static const uint32_t onebyte[8] = {

      		1, 0x2bfffbfb, 0xe8000001, 0x2fffffff

      	};

      	static const uint32_t lower[8] = { 0, 0, 0x7fffffe };

      	Py_ssize_t i, destlen = 0;

      	for (i = 0; i < len; i++) {

      		if (inset(onebyte, src[i]))

      			charcopy(dest, &destlen, destsize, src[i]);

      		else if (inset(lower, src[i]))

      			charcopy(dest, &destlen, destsize, src[i] + 32);

      		else

      			escape3(dest, &destlen, destsize, src[i]);

      	}

      	return destlen;

      }

      PyObject *lowerencode(PyObject *self, PyObject *args)

      {

      	char *path;

      	Py_ssize_t len, newlen;

      	PyObject *ret;

      	if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len))

      		return NULL;

      	newlen = _lowerencode(NULL, 0, path, len);

      	ret = PyString_FromStringAndSize(NULL, newlen);

      	if (ret)

      		newlen = _lowerencode(PyString_AS_STRING(ret), newlen,

      				      path, len);

      	return ret;

      }

        Bryan O'Sullivan
    
pathencode: implement hashed encoding in C...

              r18433
            
      /* See store.py:_auxencode for a description. */

      static Py_ssize_t auxencode(char *dest, size_t destsize,

      			    const char *src, Py_ssize_t len)

      {

      	static const uint32_t twobytes[8];

      	static const uint32_t onebyte[8] = {

      		~0, 0xffff3ffe, ~0, ~0, ~0, ~0, ~0, ~0,

      	};

      	return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);

      }

        Bryan O'Sullivan
    
pathencode: implement the "mangling" part of hashed encoding in C...

              r18432
            
      static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])

      {

      	static const Py_ssize_t dirprefixlen = 8;

      	static const Py_ssize_t maxshortdirslen = 68;

      	char *dest;

      	PyObject *ret;

      	Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;

      	Py_ssize_t destsize, destlen = 0, slop, used;

      	while (lastslash >= 0 && src[lastslash] != '/') {

      		if (src[lastslash] == '.' && lastdot == -1)

      			lastdot = lastslash;

      		lastslash--;

      	}

      #if 0

      	/* All paths should end in a suffix of ".i" or ".d".

                 Unfortunately, the file names in test-hybridencode.py

                 violate this rule.  */

      	if (lastdot != len - 3) {

      		PyErr_SetString(PyExc_ValueError,

      				"suffix missing or wrong length");

      		return NULL;

      	}

      #endif

      	/* If src contains a suffix, we will append it to the end of

      	   the new string, so make room. */

      	destsize = 120;

      	if (lastdot >= 0)

      		destsize += len - lastdot - 1;

      	ret = PyString_FromStringAndSize(NULL, destsize);

      	if (ret == NULL)

      		return NULL;

      	dest = PyString_AS_STRING(ret);

      	memcopy(dest, &destlen, destsize, "dh/", 3);

      	/* Copy up to dirprefixlen bytes of each path component, up to

      	   a limit of maxshortdirslen bytes. */

      	for (i = d = p = 0; i < lastslash; i++, p++) {

      		if (src[i] == '/') {

      			char d = dest[destlen - 1];

      			/* After truncation, a directory name may end

      			   in a space or dot, which are unportable. */

      			if (d == '.' || d == ' ')

      				dest[destlen - 1] = '_';

      			if (destlen > maxshortdirslen)

      				break;

      			charcopy(dest, &destlen, destsize, src[i]);

      			p = -1;

      		}

      		else if (p < dirprefixlen)

      			charcopy(dest, &destlen, destsize, src[i]);

      	}

      	/* Rewind to just before the last slash copied. */

      	if (destlen > maxshortdirslen + 3)

      		do {

      			destlen--;

      		} while (destlen > 0 && dest[destlen] != '/');

      	if (destlen > 3) {

      		if (lastslash > 0) {

      			char d = dest[destlen - 1];

      			/* The last directory component may be

      			   truncated, so make it safe. */

      			if (d == '.' || d == ' ')

      				dest[destlen - 1] = '_';

      		}

      		charcopy(dest, &destlen, destsize, '/');

      	}

      	/* Add a prefix of the original file's name. Its length

      	   depends on the number of bytes left after accounting for

      	   hash and suffix. */

      	used = destlen + 40;

      	if (lastdot >= 0)

      		used += len - lastdot - 1;

      	slop = maxstorepathlen - used;

      	if (slop > 0) {

      		Py_ssize_t basenamelen =

      			lastslash >= 0 ? len - lastslash - 2 : len - 1;

      		if (basenamelen > slop)

      			basenamelen = slop;

      		if (basenamelen > 0)

      			memcopy(dest, &destlen, destsize, &src[lastslash + 1],

      				basenamelen);

      	}

      	/* Add hash and suffix. */

      	for (i = 0; i < 20; i++)

      		hexencode(dest, &destlen, destsize, sha[i]);

      	if (lastdot >= 0)

      		memcopy(dest, &destlen, destsize, &src[lastdot],

      			len - lastdot - 1);

      	PyString_GET_SIZE(ret) = destlen;

      	return ret;

      }

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      /*

        Bryan O'Sullivan
    
pathencode: add a SHA-1 hash function...

              r18431
            
       * Avoiding a trip through Python would improve performance by 50%,

       * but we don't encounter enough long names to be worth the code.

       */

      static int sha1hash(char hash[20], const char *str, Py_ssize_t len)

      {

      	static PyObject *shafunc;

      	PyObject *shaobj, *hashobj;

      	if (shafunc == NULL) {

      		PyObject *util, *name = PyString_FromString("mercurial.util");

      		if (name == NULL)

      			return -1;

      		util = PyImport_Import(name);

      		Py_DECREF(name);

      		if (util == NULL) {

      			PyErr_SetString(PyExc_ImportError, "mercurial.util");

      			return -1;

      		}

      		shafunc = PyObject_GetAttrString(util, "sha1");

      		Py_DECREF(util);

      		if (shafunc == NULL) {

      			PyErr_SetString(PyExc_AttributeError,

      					"module 'mercurial.util' has no "

      					"attribute 'sha1'");

      			return -1;

      		}

      	}

      	shaobj = PyObject_CallFunction(shafunc, "s#", str, len);

      	if (shaobj == NULL)

      		return -1;

      	hashobj = PyObject_CallMethod(shaobj, "digest", "");

      	Py_DECREF(shaobj);

      	if (!PyString_Check(hashobj) || PyString_GET_SIZE(hashobj) != 20) {

      		PyErr_SetString(PyExc_TypeError,

      				"result of digest is not a 20-byte hash");

      		Py_DECREF(hashobj);

      		return -1;

      	}

      	memcpy(hash, PyString_AS_STRING(hashobj), 20);

      	Py_DECREF(hashobj);

      	return 0;

      }

        Matt Mackall
    
pathencode: don't use alloca() for safety/portability

              r18452
            
      #define MAXENCODE 4096 * 3

        Bryan O'Sullivan
    
pathencode: implement hashed encoding in C...

              r18433
            
      static PyObject *hashencode(const char *src, Py_ssize_t len)

      {

        Matt Mackall
    
pathencode: don't use alloca() for safety/portability

              r18452
            
      	char dired[MAXENCODE];

      	char lowered[MAXENCODE];

      	char auxed[MAXENCODE];

      	Py_ssize_t dirlen, lowerlen, auxlen, baselen;

        Bryan O'Sullivan
    
pathencode: implement hashed encoding in C...

              r18433
            
      	char sha[20];

        Matt Mackall
    
pathencode: don't use alloca() for safety/portability

              r18452
            
      	baselen = (len - 5) * 3;

      	if (baselen >= MAXENCODE) {

      		PyErr_SetString(PyExc_ValueError, "string too long");

      		return NULL;

      	}

        Bryan O'Sullivan
    
pathencode: implement hashed encoding in C...

              r18433
            
      	dirlen = _encodedir(dired, baselen, src, len);

      	if (sha1hash(sha, dired, dirlen - 1) == -1)

      		return NULL;

      	lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);

      	auxlen = auxencode(auxed, baselen, lowered, lowerlen);

      	return hashmangle(auxed, auxlen, sha);

      }

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      PyObject *pathencode(PyObject *self, PyObject *args)

      {

      	Py_ssize_t len, newlen;

      	PyObject *pathobj, *newobj;

      	char *path;

      	if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))

      		return NULL;

      	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {

      		PyErr_SetString(PyExc_TypeError, "expected a string");

      		return NULL;

      	}

        Bryan O'Sullivan
    
pathencode: implement both basic and hashed encoding in C

              r18434
            
      	if (len > maxstorepathlen)

      		newlen = maxstorepathlen + 2;

      	else

      		newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      	if (newlen <= maxstorepathlen + 1) {

      		if (newlen == len + 1) {

      			Py_INCREF(pathobj);

      			return pathobj;

      		}

      		newobj = PyString_FromStringAndSize(NULL, newlen);

      		if (newobj) {

      			PyString_GET_SIZE(newobj)--;

      			basicencode(PyString_AS_STRING(newobj), newlen, path,

      				    len + 1);

      		}

      	}

        Bryan O'Sullivan
    
pathencode: implement both basic and hashed encoding in C

              r18434
            
      	else

      		newobj = hashencode(path, len + 1);

        Bryan O'Sullivan
    
store: implement fncache basic path encoding in C...

              r17616
            
      	return newobj;

      }

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

Adrian Buehlmann pathencode: new C module with fast encodedir() function...	r17606	/*
		pathencode.c - efficient path name encoding

		Copyright 2012 Facebook

		This software may be used and distributed according to the terms of
		the GNU General Public License, incorporated herein by reference.
		*/

Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	/*
		* An implementation of the name encoding scheme used by the fncache
		* store. The common case is of a path < 120 bytes long, which is
		* handled either in a single pass with no allocations or two passes
		* with a single allocation. For longer paths, multiple passes are
		* required.
		*/

Bryan O'Sullivan store: implement lowerencode in C	r18430	#define PY_SSIZE_T_CLEAN
Adrian Buehlmann pathencode: new C module with fast encodedir() function...	r17606	#include <Python.h>
		#include <assert.h>
		#include <ctype.h>
		#include <stdlib.h>
		#include <string.h>

		#include "util.h"

Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	/* state machine for the fast path */
		enum path_state {
		START, /* first byte of a path component */
		A, /* "AUX" */
		AU,
		THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
		C, /* "CON" or "COMn" */
		CO,
		COMLPT, /* "COM" or "LPT" */
		COMLPTn,
		L,
		LP,
		N,
		NU,
		P, /* "PRN" */
		PR,
		LDOT, /* leading '.' */
		DOT, /* '.' in a non-leading position */
		H, /* ".h" */
		HGDI, /* ".hg", ".d", or ".i" */
		SPACE,
		DEFAULT, /* byte of a path component after the first */
		};

Adrian Buehlmann pathencode: new C module with fast encodedir() function...	r17606	/* state machine for dir-encoding */
		enum dir_state {
		DDOT,
		DH,
		DHGDI,
		DDEFAULT,
		};

André Sintzoff pathencode: change isset name to avoid name collision...	r17699	static inline int inset(const uint32_t bitset[], char c)
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	{
		return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));
		}

Adrian Buehlmann pathencode: new C module with fast encodedir() function...	r17606	static inline void charcopy(char dest, Py_ssize_t destlen, size_t destsize,
		char c)
		{
		if (dest) {
		assert(*destlen < destsize);
		dest[*destlen] = c;
		}
		(*destlen)++;
		}

		static inline void memcopy(char dest, Py_ssize_t destlen, size_t destsize,
		const void *src, Py_ssize_t len)
		{
		if (dest) {
		assert(*destlen + len < destsize);
		memcpy((void )&dest[destlen], src, len);
		}
		*destlen += len;
		}

Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	static inline void hexencode(char dest, Py_ssize_t destlen, size_t destsize,
		uint8_t c)
		{
		static const char hexdigit[] = "0123456789abcdef";

		charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
		charcopy(dest, destlen, destsize, hexdigit[c & 15]);
		}

		/* 3-byte escape: tilde followed by two hex digits */
		static inline void escape3(char dest, Py_ssize_t destlen, size_t destsize,
		char c)
		{
		charcopy(dest, destlen, destsize, '~');
		hexencode(dest, destlen, destsize, c);
		}

Adrian Buehlmann pathencode: new C module with fast encodedir() function...	r17606	static Py_ssize_t _encodedir(char *dest, size_t destsize,
		const char *src, Py_ssize_t len)
		{
		enum dir_state state = DDEFAULT;
		Py_ssize_t i = 0, destlen = 0;

		while (i < len) {
		switch (state) {
		case DDOT:
		switch (src[i]) {
		case 'd':
		case 'i':
		state = DHGDI;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'h':
		state = DH;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		default:
		state = DDEFAULT;
		break;
		}
		break;
		case DH:
		if (src[i] == 'g') {
		state = DHGDI;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DDEFAULT;
		break;
		case DHGDI:
		if (src[i] == '/') {
		memcopy(dest, &destlen, destsize, ".hg", 3);
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		state = DDEFAULT;
		break;
		case DDEFAULT:
		if (src[i] == '.')
		state = DDOT;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		}
		}

		return destlen;
		}

		PyObject encodedir(PyObject self, PyObject *args)
		{
		Py_ssize_t len, newlen;
		PyObject pathobj, newobj;
		char *path;

		if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))
		return NULL;

		if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
		PyErr_SetString(PyExc_TypeError, "expected a string");
		return NULL;
		}

		newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;

		if (newlen == len + 1) {
		Py_INCREF(pathobj);
		return pathobj;
		}

		newobj = PyString_FromStringAndSize(NULL, newlen);

		if (newobj) {
		PyString_GET_SIZE(newobj)--;
		_encodedir(PyString_AS_STRING(newobj), newlen, path,
		len + 1);
		}

		return newobj;
		}
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616
		static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
		char *dest, Py_ssize_t destlen, size_t destsize,
		const char *src, Py_ssize_t len,
		int encodedir)
		{
		enum path_state state = START;
		Py_ssize_t i = 0;

		/*
		* Python strings end with a zero byte, which we use as a
		* terminal token as they are not valid inside path names.
		*/

		while (i < len) {
		switch (state) {
		case START:
		switch (src[i]) {
		case '/':
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case '.':
		state = LDOT;
		escape3(dest, &destlen, destsize, src[i++]);
		break;
		case ' ':
		state = DEFAULT;
		escape3(dest, &destlen, destsize, src[i++]);
		break;
		case 'a':
		state = A;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'c':
		state = C;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'l':
		state = L;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'n':
		state = N;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'p':
		state = P;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		default:
		state = DEFAULT;
		break;
		}
		break;
		case A:
		if (src[i] == 'u') {
		state = AU;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case AU:
		if (src[i] == 'x') {
		state = THIRD;
		i++;
		}
		else state = DEFAULT;
		break;
		case THIRD:
		state = DEFAULT;
		switch (src[i]) {
		case '.':
		case '/':
		case '\0':
		escape3(dest, &destlen, destsize, src[i - 1]);
		break;
		default:
		i--;
		break;
		}
		break;
		case C:
		if (src[i] == 'o') {
		state = CO;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case CO:
		if (src[i] == 'm') {
		state = COMLPT;
		i++;
		}
		else if (src[i] == 'n') {
		state = THIRD;
		i++;
		}
		else state = DEFAULT;
		break;
		case COMLPT:
		switch (src[i]) {
		case '1': case '2': case '3': case '4': case '5':
		case '6': case '7': case '8': case '9':
		state = COMLPTn;
		i++;
		break;
		default:
		state = DEFAULT;
		charcopy(dest, &destlen, destsize, src[i - 1]);
		break;
		}
		break;
		case COMLPTn:
		state = DEFAULT;
		switch (src[i]) {
		case '.':
		case '/':
		case '\0':
		escape3(dest, &destlen, destsize, src[i - 2]);
		charcopy(dest, &destlen, destsize, src[i - 1]);
		break;
		default:
		memcopy(dest, &destlen, destsize,
		&src[i - 2], 2);
		break;
		}
		break;
		case L:
		if (src[i] == 'p') {
		state = LP;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case LP:
		if (src[i] == 't') {
		state = COMLPT;
		i++;
		}
		else state = DEFAULT;
		break;
		case N:
		if (src[i] == 'u') {
		state = NU;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case NU:
		if (src[i] == 'l') {
		state = THIRD;
		i++;
		}
		else state = DEFAULT;
		break;
		case P:
		if (src[i] == 'r') {
		state = PR;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case PR:
		if (src[i] == 'n') {
		state = THIRD;
		i++;
		}
		else state = DEFAULT;
		break;
		case LDOT:
		switch (src[i]) {
		case 'd':
		case 'i':
		state = HGDI;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'h':
		state = H;
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		default:
		state = DEFAULT;
		break;
		}
		break;
		case DOT:
		switch (src[i]) {
		case '/':
		case '\0':
		state = START;
		memcopy(dest, &destlen, destsize, "~2e", 3);
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'd':
		case 'i':
		state = HGDI;
		charcopy(dest, &destlen, destsize, '.');
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		case 'h':
		state = H;
		memcopy(dest, &destlen, destsize, ".h", 2);
		i++;
		break;
		default:
		state = DEFAULT;
		charcopy(dest, &destlen, destsize, '.');
		break;
		}
		break;
		case H:
		if (src[i] == 'g') {
		state = HGDI;
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case HGDI:
		if (src[i] == '/') {
		state = START;
		if (encodedir)
		memcopy(dest, &destlen, destsize, ".hg",
		3);
		charcopy(dest, &destlen, destsize, src[i++]);
		}
		else state = DEFAULT;
		break;
		case SPACE:
		switch (src[i]) {
		case '/':
		case '\0':
		state = START;
		memcopy(dest, &destlen, destsize, "~20", 3);
		charcopy(dest, &destlen, destsize, src[i++]);
		break;
		default:
		state = DEFAULT;
		charcopy(dest, &destlen, destsize, ' ');
		break;
		}
		break;
		case DEFAULT:
André Sintzoff pathencode: change isset name to avoid name collision...	r17699	while (inset(onebyte, src[i])) {
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	charcopy(dest, &destlen, destsize, src[i++]);
		if (i == len)
		goto done;
		}
		switch (src[i]) {
		case '.':
		state = DOT;
		i++;
		break;
		case ' ':
		state = SPACE;
		i++;
		break;
		case '/':
		state = START;
		charcopy(dest, &destlen, destsize, '/');
		i++;
		break;
		default:
André Sintzoff pathencode: change isset name to avoid name collision...	r17699	if (inset(onebyte, src[i])) {
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	do {
		charcopy(dest, &destlen,
		destsize, src[i++]);
		} while (i < len &&
André Sintzoff pathencode: change isset name to avoid name collision...	r17699	inset(onebyte, src[i]));
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	}
André Sintzoff pathencode: change isset name to avoid name collision...	r17699	else if (inset(twobytes, src[i])) {
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	char c = src[i++];
		charcopy(dest, &destlen, destsize, '_');
		charcopy(dest, &destlen, destsize,
		c == '_' ? '_' : c + 32);
		}
		else
		escape3(dest, &destlen, destsize,
		src[i++]);
		break;
		}
		break;
		}
		}
		done:
		return destlen;
		}

		static Py_ssize_t basicencode(char *dest, size_t destsize,
		const char *src, Py_ssize_t len)
		{
		static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };

		static const uint32_t onebyte[8] = {
		1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
		};

		Py_ssize_t destlen = 0;

		return _encode(twobytes, onebyte, dest, destlen, destsize,
Adrian Buehlmann pathencode: simplify basicencode	r17691	src, len, 1);
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	}

		static const Py_ssize_t maxstorepathlen = 120;

Bryan O'Sullivan store: implement lowerencode in C	r18430	static Py_ssize_t _lowerencode(char *dest, size_t destsize,
		const char *src, Py_ssize_t len)
		{
		static const uint32_t onebyte[8] = {
		1, 0x2bfffbfb, 0xe8000001, 0x2fffffff
		};

		static const uint32_t lower[8] = { 0, 0, 0x7fffffe };

		Py_ssize_t i, destlen = 0;

		for (i = 0; i < len; i++) {
		if (inset(onebyte, src[i]))
		charcopy(dest, &destlen, destsize, src[i]);
		else if (inset(lower, src[i]))
		charcopy(dest, &destlen, destsize, src[i] + 32);
		else
		escape3(dest, &destlen, destsize, src[i]);
		}

		return destlen;
		}

		PyObject lowerencode(PyObject self, PyObject *args)
		{
		char *path;
		Py_ssize_t len, newlen;
		PyObject *ret;

		if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len))
		return NULL;

		newlen = _lowerencode(NULL, 0, path, len);
		ret = PyString_FromStringAndSize(NULL, newlen);
		if (ret)
		newlen = _lowerencode(PyString_AS_STRING(ret), newlen,
		path, len);

		return ret;
		}

Bryan O'Sullivan pathencode: implement hashed encoding in C...	r18433	/* See store.py:_auxencode for a description. */
		static Py_ssize_t auxencode(char *dest, size_t destsize,
		const char *src, Py_ssize_t len)
		{
		static const uint32_t twobytes[8];

		static const uint32_t onebyte[8] = {
		~0, 0xffff3ffe, ~0, ~0, ~0, ~0, ~0, ~0,
		};

		return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
		}

Bryan O'Sullivan pathencode: implement the "mangling" part of hashed encoding in C...	r18432	static PyObject hashmangle(const char src, Py_ssize_t len, const char sha[20])
		{
		static const Py_ssize_t dirprefixlen = 8;
		static const Py_ssize_t maxshortdirslen = 68;
		char *dest;
		PyObject *ret;

		Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
		Py_ssize_t destsize, destlen = 0, slop, used;

		while (lastslash >= 0 && src[lastslash] != '/') {
		if (src[lastslash] == '.' && lastdot == -1)
		lastdot = lastslash;
		lastslash--;
		}

		#if 0
		/* All paths should end in a suffix of ".i" or ".d".
		Unfortunately, the file names in test-hybridencode.py
		violate this rule. */
		if (lastdot != len - 3) {
		PyErr_SetString(PyExc_ValueError,
		"suffix missing or wrong length");
		return NULL;
		}
		#endif

		/* If src contains a suffix, we will append it to the end of
		the new string, so make room. */
		destsize = 120;
		if (lastdot >= 0)
		destsize += len - lastdot - 1;

		ret = PyString_FromStringAndSize(NULL, destsize);
		if (ret == NULL)
		return NULL;

		dest = PyString_AS_STRING(ret);
		memcopy(dest, &destlen, destsize, "dh/", 3);

		/* Copy up to dirprefixlen bytes of each path component, up to
		a limit of maxshortdirslen bytes. */
		for (i = d = p = 0; i < lastslash; i++, p++) {
		if (src[i] == '/') {
		char d = dest[destlen - 1];
		/* After truncation, a directory name may end
		in a space or dot, which are unportable. */
		if (d == '.' \|\| d == ' ')
		dest[destlen - 1] = '_';
		if (destlen > maxshortdirslen)
		break;
		charcopy(dest, &destlen, destsize, src[i]);
		p = -1;
		}
		else if (p < dirprefixlen)
		charcopy(dest, &destlen, destsize, src[i]);
		}

		/* Rewind to just before the last slash copied. */
		if (destlen > maxshortdirslen + 3)
		do {
		destlen--;
		} while (destlen > 0 && dest[destlen] != '/');

		if (destlen > 3) {
		if (lastslash > 0) {
		char d = dest[destlen - 1];
		/* The last directory component may be
		truncated, so make it safe. */
		if (d == '.' \|\| d == ' ')
		dest[destlen - 1] = '_';
		}

		charcopy(dest, &destlen, destsize, '/');
		}

		/* Add a prefix of the original file's name. Its length
		depends on the number of bytes left after accounting for
		hash and suffix. */
		used = destlen + 40;
		if (lastdot >= 0)
		used += len - lastdot - 1;
		slop = maxstorepathlen - used;
		if (slop > 0) {
		Py_ssize_t basenamelen =
		lastslash >= 0 ? len - lastslash - 2 : len - 1;

		if (basenamelen > slop)
		basenamelen = slop;
		if (basenamelen > 0)
		memcopy(dest, &destlen, destsize, &src[lastslash + 1],
		basenamelen);
		}

		/* Add hash and suffix. */
		for (i = 0; i < 20; i++)
		hexencode(dest, &destlen, destsize, sha[i]);

		if (lastdot >= 0)
		memcopy(dest, &destlen, destsize, &src[lastdot],
		len - lastdot - 1);

		PyString_GET_SIZE(ret) = destlen;

		return ret;
		}

Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	/*
Bryan O'Sullivan pathencode: add a SHA-1 hash function...	r18431	* Avoiding a trip through Python would improve performance by 50%,
		* but we don't encounter enough long names to be worth the code.
		*/
		static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
		{
		static PyObject *shafunc;
		PyObject shaobj, hashobj;

		if (shafunc == NULL) {
		PyObject util, name = PyString_FromString("mercurial.util");

		if (name == NULL)
		return -1;

		util = PyImport_Import(name);
		Py_DECREF(name);

		if (util == NULL) {
		PyErr_SetString(PyExc_ImportError, "mercurial.util");
		return -1;
		}
		shafunc = PyObject_GetAttrString(util, "sha1");
		Py_DECREF(util);

		if (shafunc == NULL) {
		PyErr_SetString(PyExc_AttributeError,
		"module 'mercurial.util' has no "
		"attribute 'sha1'");
		return -1;
		}
		}

		shaobj = PyObject_CallFunction(shafunc, "s#", str, len);

		if (shaobj == NULL)
		return -1;

		hashobj = PyObject_CallMethod(shaobj, "digest", "");
		Py_DECREF(shaobj);

		if (!PyString_Check(hashobj) \|\| PyString_GET_SIZE(hashobj) != 20) {
		PyErr_SetString(PyExc_TypeError,
		"result of digest is not a 20-byte hash");
		Py_DECREF(hashobj);
		return -1;
		}

		memcpy(hash, PyString_AS_STRING(hashobj), 20);
		Py_DECREF(hashobj);
		return 0;
		}

Matt Mackall pathencode: don't use alloca() for safety/portability	r18452	#define MAXENCODE 4096 * 3

Bryan O'Sullivan pathencode: implement hashed encoding in C...	r18433	static PyObject hashencode(const char src, Py_ssize_t len)
		{
Matt Mackall pathencode: don't use alloca() for safety/portability	r18452	char dired[MAXENCODE];
		char lowered[MAXENCODE];
		char auxed[MAXENCODE];
		Py_ssize_t dirlen, lowerlen, auxlen, baselen;
Bryan O'Sullivan pathencode: implement hashed encoding in C...	r18433	char sha[20];

Matt Mackall pathencode: don't use alloca() for safety/portability	r18452	baselen = (len - 5) * 3;
		if (baselen >= MAXENCODE) {
		PyErr_SetString(PyExc_ValueError, "string too long");
		return NULL;
		}

Bryan O'Sullivan pathencode: implement hashed encoding in C...	r18433	dirlen = _encodedir(dired, baselen, src, len);
		if (sha1hash(sha, dired, dirlen - 1) == -1)
		return NULL;
		lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);
		auxlen = auxencode(auxed, baselen, lowered, lowerlen);
		return hashmangle(auxed, auxlen, sha);
		}

Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616	PyObject pathencode(PyObject self, PyObject *args)
		{
		Py_ssize_t len, newlen;
		PyObject pathobj, newobj;
		char *path;

		if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
		return NULL;

		if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
		PyErr_SetString(PyExc_TypeError, "expected a string");
		return NULL;
		}

Bryan O'Sullivan pathencode: implement both basic and hashed encoding in C	r18434	if (len > maxstorepathlen)
		newlen = maxstorepathlen + 2;
		else
		newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616
		if (newlen <= maxstorepathlen + 1) {
		if (newlen == len + 1) {
		Py_INCREF(pathobj);
		return pathobj;
		}

		newobj = PyString_FromStringAndSize(NULL, newlen);

		if (newobj) {
		PyString_GET_SIZE(newobj)--;
		basicencode(PyString_AS_STRING(newobj), newlen, path,
		len + 1);
		}
		}
Bryan O'Sullivan pathencode: implement both basic and hashed encoding in C	r18434	else
		newobj = hashencode(path, len + 1);
Bryan O'Sullivan store: implement fncache basic path encoding in C...	r17616
		return newobj;
		}