5. Python3源码—字符串(str)对象

发布时间:2019-10-12 20:09:00编辑:auto阅读(2086)

    5.1. 字符串对象

    字符串对象是“变长对象”。

    5.1.1. Python中的创建

    Python中字符串(str)对象最重要的创建方法为PyUnicode_DecodeUTF8Stateful,如下Python语句最终会调用到PyUnicode_DecodeUTF8Stateful:

    a = 'hello'
    b = str('world')

    5.1.2. PyUnicode_DecodeUTF8Stateful的C调用栈

    词法解析,最终调到PyUnicode_DecodeUTF8Stateful,调用顺序如下:

    // ast.c
    ast_for_expr
    =>ast_for_power
    =>ast_for_atom_expr
    =>ast_for_atom (case STRING)
    =>parsestrplus
    =>parsestr
    
    // unicodeobject.c
    => PyUnicode_DecodeUTF8Stateful

    5.1.3. PyUnicode_DecodeUTF8Stateful源码

    // unicodeobject.c
    /*
     * Decode `size` bytes starting at `s` and return the resulting object,
     * or NULL when the onError path is taken.  When `consumed` is non-NULL
     * it receives the number of input bytes processed.
     *
     * Two shortcuts bypass the writer entirely: empty input returns the
     * shared empty string, and a single byte below 128 returns whatever
     * get_latin1_char() yields for that byte.
     *
     * NOTE(review): `errors`, `startinpos`, `endinpos`, `errmsg` and
     * `error_handler` are not used in the lines shown here; presumably the
     * omitted loop body uses them.
     */
    PyObject *
    PyUnicode_DecodeUTF8Stateful(const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
    {
        _PyUnicodeWriter writer;
        const char *starts = s;     /* kept so `consumed` can be computed at End */
        const char *end = s + size;
    
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *errmsg = "";
        PyObject *error_handler_obj = NULL;
        PyObject *exc = NULL;
        _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    
        /* Empty input: nothing consumed, hand back the shared empty string. */
        if (size == 0) {
            if (consumed)
                *consumed = 0;
            _Py_RETURN_UNICODE_EMPTY();
        }
    
        /* ASCII is equivalent to the first 128 ordinals in Unicode. */
        if (size == 1 && (unsigned char)s[0] < 128) {
            if (consumed)
                *consumed = 1;
            return get_latin1_char((unsigned char)s[0]);
        }
    
        /* General case: prepare a writer with room for `size` characters.
           NOTE(review): 127 is presumably the largest code point expected at
           this point (pure ASCII) -- confirm against _PyUnicodeWriter_Prepare. */
        _PyUnicodeWriter_Init(&writer);
        writer.min_length = size;
        if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
            goto onError;
    
        /* The value returned by ascii_decode() becomes the write position and
           is also used to advance `s` past the bytes it handled. */
        writer.pos = ascii_decode(s, end, writer.data);
        s += writer.pos;
        while (s < end) {
            // Reached when ASCII decoding handled fewer bytes than `size`.
            // NOTE(review): the loop body appears to be omitted from this
            // excerpt; as written, nothing here advances `s`.
        }
    
    End:
        /* Success exit: report how far `s` advanced, drop the helper
           references and let the writer produce the final object. */
        if (consumed)
            *consumed = s - starts;
    
        Py_XDECREF(error_handler_obj);
        Py_XDECREF(exc);
        return _PyUnicodeWriter_Finish(&writer);
    
    onError:
        /* Failure exit: drop the helper references, tear down the writer via
           _PyUnicodeWriter_Dealloc() and signal the error with NULL. */
        Py_XDECREF(error_handler_obj);
        Py_XDECREF(exc);
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }

    可以看到:

    • 空串缓存:空串(unicode_empty)为同一个地址,第二次需要空串时,只是将计数加1,在_PyUnicodeWriter_Finish中实现空串缓存。
    // unicodeobject.c
    /* The single shared empty-string object; NULL until first requested. */
    static PyObject *unicode_empty = NULL;
    
    /*
     * Take a reference to the shared empty string, creating it on first use
     * with PyUnicode_New(0, 0).  If that creation fails, unicode_empty stays
     * NULL and no reference is taken.  On successful creation the object is
     * INCREF'ed once on top of the reference returned by PyUnicode_New(), so
     * one reference remains held by the `unicode_empty` variable itself.
     */
    #define _Py_INCREF_UNICODE_EMPTY()                      \
        do {                                                \
            if (unicode_empty != NULL)                      \
                Py_INCREF(unicode_empty);                   \
            else {                                          \
                unicode_empty = PyUnicode_New(0, 0);        \
                if (unicode_empty != NULL) {                \
                    Py_INCREF(unicode_empty);               \
                    assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
                }                                           \
            }                                               \
        } while (0)
    
    /*
     * Return the shared empty string from the *enclosing* function (the macro
     * expands to a `return` statement).  The returned value is NULL when
     * _Py_INCREF_UNICODE_EMPTY() could not create the object.
     */
    #define _Py_RETURN_UNICODE_EMPTY()                      \
        do {                                                \
            _Py_INCREF_UNICODE_EMPTY();                     \
            return unicode_empty;                           \
        } while (0)
    
    // PyUnicode_DecodeUTF8Stateful->
    // _PyUnicodeWriter_Finish->
    // unicode_result_ready
    /*
     * Replace `unicode` with a shared cached object where one applies:
     *   - length 0: the shared empty string (unicode_empty);
     *   - length 1 with a character below 256: the entry of unicode_latin1[]
     *     for that character, installing `unicode` as the entry if the slot
     *     is still empty.
     * Any other object is returned unchanged.  When a cached object is
     * substituted, the reference to `unicode` passed in is released.
     */
    static PyObject*
    unicode_result_ready(PyObject *unicode)
    {
        Py_ssize_t length;
    
        length = PyUnicode_GET_LENGTH(unicode);
        if (length == 0) {
            /* A freshly built empty string is discarded in favour of the
               shared one; if it already is the shared one, pass it through. */
            if (unicode != unicode_empty) {
                Py_DECREF(unicode);
                _Py_RETURN_UNICODE_EMPTY();
            }
            return unicode_empty;
        }
    
        if (length == 1) {
            void *data = PyUnicode_DATA(unicode);
            int kind = PyUnicode_KIND(unicode);
            Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
            if (ch < 256) {
                PyObject *latin1_char = unicode_latin1[ch];
                if (latin1_char != NULL) {
                    /* Cache hit: swap the caller's reference over to the
                       cached object (nothing to do if it is the same one). */
                    if (unicode != latin1_char) {
                        Py_INCREF(latin1_char);
                        Py_DECREF(unicode);
                    }
                    return latin1_char;
                }
                else {
                    /* Cache miss: this object becomes the cached instance;
                       the extra reference is the one held by the table. */
                    assert(_PyUnicode_CheckConsistency(unicode, 1));
                    Py_INCREF(unicode);
                    unicode_latin1[ch] = unicode;
                    return unicode;
                }
            }
        }
    
        /* Not eligible for either cache: return the object as built. */
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }
    • 字符缓冲池:字符(unicode_latin1)为同一个地址,第二次需要该字符时,只是将计数加1,在get_latin1_char中实现字符缓存。
    // unicodeobject.c
    /* Cache of one-character strings, indexed by character value (0..255);
       every slot starts out NULL and is filled on first use. */
    static PyObject *unicode_latin1[256] = {NULL};
    
    /* Abridged view of the decoder showing only the single-character
       shortcut that goes through get_latin1_char(). */
    PyObject *
    PyUnicode_DecodeUTF8Stateful(const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
    {
          // ... (preceding code omitted in this excerpt) ...
    
        /* ASCII is equivalent to the first 128 ordinals in Unicode. */
        if (size == 1 && (unsigned char)s[0] < 128) {
            if (consumed)
                *consumed = 1;
            return get_latin1_char((unsigned char)s[0]);
        }
    
          // ... (remaining code omitted in this excerpt) ...
    }
    
    /*
     * Return a new reference to the cached one-character string for `ch`,
     * creating and caching it on first use.  Returns NULL if PyUnicode_New()
     * fails, in which case the cache slot is left untouched.
     */
    static PyObject*
    get_latin1_char(unsigned char ch)
    {
        PyObject *unicode = unicode_latin1[ch];
        if (!unicode) {
            /* First request for this character: build a length-1 string,
               store the character in it and publish it in the table.  The
               reference returned by PyUnicode_New() stays with the table. */
            unicode = PyUnicode_New(1, ch);
            if (!unicode)
                return NULL;
            PyUnicode_1BYTE_DATA(unicode)[0] = ch;
            assert(_PyUnicode_CheckConsistency(unicode, 1));
            unicode_latin1[ch] = unicode;
        }
        /* The caller always receives its own reference. */
        Py_INCREF(unicode);
        return unicode;
    }

    5.2. 常量字符串池

    a = 'hello'
    b = 'hello'
    a is b  #True

    由上例可以看出Python对常量字符串做了缓存。缓存的关键性实现在PyUnicode_InternInPlace方法中。

    5.2.1. PyUnicode_InternInPlace的C调用堆栈

    // compile.c
    assemble
    =>makecode
    // codeobject.c
    =>PyCode_New
    =>intern_string_constants
    // unicodeobject.c
    =>PyUnicode_InternInPlace

    5.2.2. PyUnicode_InternInPlace源码

    // unicodeobject.c
    /* Dictionary of interned strings (each string is stored as both key and
       value); created lazily on the first interning request. */
    static PyObject *interned = NULL;
    
    /*
     * Intern the string referenced by *p.
     *
     * If an equal string is already present in `interned`, *p is replaced by
     * that existing object; otherwise *p's own object is inserted and marked
     * as interned.  The function never reports failure: on any error the
     * pending exception is cleared and *p is left as it was.  Nothing is done
     * for NULL / non-string input (non-debug builds), for str subclasses, or
     * for strings that are already interned.
     */
    void
    PyUnicode_InternInPlace(PyObject **p)
    {
        PyObject *s = *p;
        PyObject *t;
    #ifdef Py_DEBUG
        assert(s != NULL);
        assert(_PyUnicode_CHECK(s));
    #else
        if (s == NULL || !PyUnicode_Check(s))
            return;
    #endif
        /* If it's a subclass, we don't really know what putting
           it in the interned dict might do. */
        if (!PyUnicode_CheckExact(s))
            return;
        if (PyUnicode_CHECK_INTERNED(s))
            return;
        if (interned == NULL) {
            interned = PyDict_New();
            if (interned == NULL) {
                PyErr_Clear(); /* Don't leave an exception */
                return;
            }
        }
        /* Insert s -> s unless an equal key exists; t is the value that ends
           up associated with the key. */
        Py_ALLOW_RECURSION
        t = PyDict_SetDefault(interned, s, s);
        Py_END_ALLOW_RECURSION
        if (t == NULL) {
            PyErr_Clear();
            return;
        }
        if (t != s) {
            /* An equal string was interned earlier: point *p at it instead.
               The INCREF supplies the reference that *p will now own. */
            Py_INCREF(t);
            Py_SETREF(*p, t);
            return;
        }
        /* s itself was inserted, so the dict now refers to it twice (as key
           and as value). */
        /* The two references in interned are not counted by refcnt.
           The deallocator will take care of this */
        Py_REFCNT(s) -= 2;
        _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
    }

    其中最关键的方法为PyDict_SetDefault,该方法存在于字典对象dictobject.c中。如果没有相同的key(此处为s),则返回defaultobject(此处也为s),否则如果有相同的key则返回对应的value。所以如果t与s不同,则说明字典中有相应的key,此时将t的计数加1,并且将之前常量字符串的对象指向t。

    如此一来,常量字符串的对象地址就一致了,此时s的计数会被消除,如果s的计数为0,则会被释放。值得注意的是,常量字符串的对象每次仍旧会被多分配一次内存,只是如果之前有分配过,且如果此次分配的对象计数为0,则会被释放。

    有些情况下(字符串包含0-9、a-z、A-Z、下划线以外的字符)不会放到字典里,这时候可以通过sys.intern进行性能优化:

    import sys
    a = '啊'
    b = '啊'
    a is b    # False
    
    a = sys.intern('啊')
    b = sys.intern('啊')
    a is b    # True

    具体可以参考:memory - What does python sys.intern do, and when should it be used? - Stack Overflow

    5.3. 字符串对象的特性

    支持tp_as_number、tp_as_sequence、tp_as_mapping这三种操作。

    5.3.1. 数值操作

    // unicodeobject.c
    &unicode_as_number,                         /* tp_as_number */

    5.3.2. 序列操作

    // unicodeobject.c
    &unicode_as_sequence,                     /* tp_as_sequence */
    // unicodeobject.c
    /* Sequence-protocol slot table for the string type.  Only the read-side
       slots (length, concat, repeat, item, contains) are filled in; the
       slice and assignment slots are left as 0. */
    static PySequenceMethods unicode_as_sequence = {
        (lenfunc) unicode_length,       /* sq_length */
        PyUnicode_Concat,           /* sq_concat */
        (ssizeargfunc) unicode_repeat,  /* sq_repeat */
        (ssizeargfunc) unicode_getitem,     /* sq_item */
        0,                  /* sq_slice */
        0,                  /* sq_ass_item */
        0,                  /* sq_ass_slice */
        PyUnicode_Contains,         /* sq_contains */
    };

    因为没有实现PySequenceMethods中的设置方法,所以字符串不可变。

    其中:

    • unicode_length
    len('hello')
    • PyUnicode_Concat
    'hello' + 'world'

    多个字符串相加效率低于join,join只分配一次内存;

    • unicode_repeat
    'hello'*10

    效率要高于同个字符串相加;

    • unicode_getitem:暂时没有找到相应Python语句;
    • PyUnicode_Contains
    'h' in 'hello'

    5.3.3. 关联操作

    // unicodeobject.c
    &unicode_as_mapping,                        /* tp_as_mapping */
    // unicodeobject.c
    /* Mapping-protocol slot table for the string type: length and subscript
       lookup are provided, subscript assignment is left as 0. */
    static PyMappingMethods unicode_as_mapping = {
        (lenfunc)unicode_length,        /* mp_length */
        (binaryfunc)unicode_subscript,  /* mp_subscript */
        (objobjargproc)0,           /* mp_ass_subscript */
    };

    其中:

    • unicode_subscript
    test = 'hello world'
    test[1]
    test[0:5]

    test[1]会走unicode_subscript方法的index分支,test[0:5]会走slice分支;

    5.3.4. to string

    // unicodeobject.c
    unicode_repr,                                   /* tp_repr */
    (reprfunc) unicode_str,                         /* tp_str */

    5.3.5. hash

    // unicodeobject.c
    (hashfunc) unicode_hash,                        /* tp_hash*/

    5.3.6. 比较

    // unicodeobject.c
    PyUnicode_RichCompare,                      /* tp_richcompare */

    5.3.7. 内置方法

    // unicodeobject.c
    unicode_methods,                              /* tp_methods */

    5.4 参考

    • Python源码剖析

    本文作者:whj0709
    阅读原文
    本文为云栖社区原创内容,未经允许不得转载。

关键字