示例

#include <cstdio>
#include <string>

// Prints sizeof(std::string), then the buffer addresses returned by c_str()
// for a string and for a copy of it. Identical addresses mean the copy shares
// storage with the original; distinct addresses mean each object has its own.
int main()
{
    // sizeof yields std::size_t; convert explicitly so the argument type
    // really is what %lu expects on every platform.
    printf("%lu\n", static_cast<unsigned long>(sizeof(std::string)));

    std::string s1 = "a";
    std::string s2 = s1;
    // %p expects a pointer to void, so convert the const char* from c_str().
    printf("%p\n%p",
           static_cast<const void*>(s1.c_str()),
           static_cast<const void*>(s2.c_str()));
}

GCC 4.8.5 输出:

8
0x109cd88
0x109cd88

GCC 5.1.0 输出:

32
0x7ffe01afc030
0x7ffe01afc010

内存分配:

#include <cstdlib>
#include <iostream>
#include <new>
#include <string>

// Replacement for the global operator new so that allocations can be logged.
// Returns a non-null pointer to at least n bytes obtained from malloc, or
// throws std::bad_alloc when malloc fails: a replacement operator new must
// not report failure by returning a null pointer.
void* operator new(std::size_t n)
{
    //std::cout << "allocate " << n << " bytes" << std::endl;
    // malloc(0) is allowed to return null, so ask for at least one byte to
    // keep a zero-sized request from being mistaken for a failure.
    void* p = malloc(n != 0 ? n : 1);
    if (!p) {
        throw std::bad_alloc();
    }
    return p;
}

// Replacement for the global operator delete, the counterpart of the
// malloc-based operator new in this file: the block goes back to free.
void operator delete(void *p) throw()
{
    //std::cout << "delete " << p << std::endl;
    if (p == 0) {
        return;  // nothing to release; free would ignore a null pointer anyway
    }
    free(p);
}

// Writes one line to std::cout describing the string s:
//   [<scene>] sizeof: <object size> size: <s.size()> capacity: <s.capacity()>
// scene is a free-form label identifying the point in the program being shown.
void print(const std::string& scene, const std::string& s)
{
    // sizeof on the reference yields the size of the std::string object itself.
    const std::size_t object_bytes = sizeof(s);
    const std::string::size_type length = s.size();
    const std::string::size_type reserved = s.capacity();

    std::cout << "[" << scene << "] sizeof: " << object_bytes;
    std::cout << " size: " << length;
    std::cout << " capacity: " << reserved << std::endl;
}

// Builds a 5-character string, reports its size and capacity, appends 24 '+'
// characters one push_back at a time, and reports again.
int main()
{
    std::string s("hello");
    print("init", s);

    // Append one character per iteration rather than in bulk, so every
    // push_back gets its own chance to grow the string.
    const size_t kAppendCount = 24;
    size_t appended = 0;
    while (appended != kAppendCount) {
        s.push_back('+');
        //std::cout << appended << ":" << s << std::endl;
        ++appended;
    }

    print("after push_back", s);
    return 0;
}

实现

  • 实现标准:ISO C++ 14882: 21 Strings library

  • GCC 4.8.5 的 std::string 采用引用计数的写时复制(Reference-counted COW string implementation)方式实现,引用计数保存在 _Rep 类中,对象内部仅有一个实际持有数据的指针 _M_dataplus,因此 sizeof(std::string) == 8;由于多个对象可能共享同一块缓冲区,COW 实现在多线程下使用并不安全。

  • 自 GCC 5.1 开始,std::string 引入了遵从 C++11 标准的新实现,默认使用 SSO(small string optimization)特性,禁用了写时复制(COW)引用计数机制,这也带来了与旧版本 std::string 的 ABI 兼容性问题。GCC 5.1 的 std::string 使用了三个私有成员:实际持有的数据指针 _M_dataplus、数据实际大小 _M_string_length、针对短小字符串优化的默认 16 字节大小的匿名 union(它是对象内部的本地缓冲区,对象位于栈上时该缓冲区也在栈上),因此 sizeof(std::string) == 32;由于不再采用引用计数机制,每个对象真正持有自己的数据,不同对象之间不共享缓冲区,因此在不同线程中分别使用各自的对象是安全的(对同一个对象的并发读写仍需自行同步)。

  • 新的 std::string 实现会导致不同版本 GCC 编译出的二进制互不兼容,为此 GCC 5 提供了相应的二进制兼容宏开关 _GLIBCXX_USE_CXX11_ABI:若 _GLIBCXX_USE_CXX11_ABI=0,则链接旧版本实现;若 _GLIBCXX_USE_CXX11_ABI=1,则链接新版本实现。

// GCC 4.8.5
// Forward declaration of the string class template; the traits and allocator
// parameters default to char_traits<_CharT> and allocator<_CharT>.
template<typename _CharT, typename _Traits = char_traits<_CharT>,
         typename _Alloc = allocator<_CharT> >
class basic_string;
// `string` is the char instantiation using those default arguments.
typedef basic_string<char> string;
 
// refer:
// gcc-releases-gcc-4.8.5/libstdc++-v3/include/bits/basic_string.h

// Abridged excerpt keeping only the pieces relevant to the object layout.
// Names such as size_type and _Atomic_word are declared outside this excerpt.
template<typename _CharT, typename _Traits, typename _Alloc>
class basic_string
{
 private:
  // Header fields; going by their names: a length, a capacity and a
  // reference count.
  struct _Rep_base
  {
    size_type _M_length;
    size_type _M_capacity;
    _Atomic_word _M_refcount;
  };
 
  // Adds nothing to _Rep_base in this excerpt (body elided).
  struct _Rep : _Rep_base
  {
  };
  // Use empty-base optimization: http://www.cantrip.org/emptyopt.html
  // Derives from the allocator and adds the data pointer as its one member.
  struct _Alloc_hider : _Alloc
  {
    _Alloc_hider(_CharT* __dat, const _Alloc& __a)
        : _Alloc(__a), _M_p(__dat) { }

    _CharT* _M_p; // The actual data.
  };
 
  // Data Members (private)
  // The only data member shown in this excerpt.
  mutable _Alloc_hider _M_dataplus;
  
  // Returns the stored character pointer.
  _CharT* _M_data() const
  { return _M_dataplus._M_p; }
 
  // Reinterprets the character pointer as a _Rep* and steps back one element,
  // i.e. yields the address sizeof(_Rep) bytes before the character data.
  _Rep* _M_rep() const
  { return &((reinterpret_cast<_Rep*> (_M_data()))[-1]); }
};
// GCC 5.1
// Abridged excerpt keeping only the data members. pointer, allocator_type and
// size_type are declared outside this excerpt.
template<typename _CharT, typename _Traits, typename _Alloc>
class basic_string
{
 private:
  // Use empty-base optimization: http://www.cantrip.org/emptyopt.html
  // Derives from the allocator type and adds the data pointer as its one member.
  struct _Alloc_hider : allocator_type // TODO check __is_final
  {
    _Alloc_hider(pointer __dat, const _Alloc& __a = _Alloc())
        : allocator_type(__a), _M_p(__dat) { }
 
    pointer _M_p; // The actual data.
  };
 
  // First two data members: the allocator/pointer wrapper and, going by its
  // name, the current string length.
  _Alloc_hider	_M_dataplus;
  size_type     _M_string_length;
 
  // 15 for a one-byte _CharT; fewer elements for wider character types.
  enum { _S_local_capacity = 15 / sizeof(_CharT) };
 
  // Third data member: an anonymous union, so the in-object buffer of
  // _S_local_capacity + 1 elements and _M_allocated_capacity share storage.
  // NOTE(review): presumably the buffer holds short strings and the capacity
  // field is used once the data is allocated elsewhere; the code that picks
  // between them is not part of this excerpt.
  union
  {
      _CharT           _M_local_buf[_S_local_capacity + 1];
      size_type        _M_allocated_capacity;
  };
};

libstdc++ vs libc++

libc++ is not binary compatible with gcc’s libstdc++ (except for some low level stuff such as operator new). For example the std::string in gcc’s libstdc++ is refcounted, whereas in libc++ it uses the “short string optimization”. If you were to accidentally mix these two strings in the same program (and mistake them for the same data structure), you would inevitably get a run time crash.

refer:

Refer