C/C++ – Page 3 – 0x2B|~0x2B

Sharing Mutex and Condition Variable Between Processes

Posted on April 10, 2014 by gonwan — 2 Comments ↓

As the title says, the key is to set the PTHREAD_PROCESS_SHARED attribute on the mutex and the condition variable using pthread_mutexattr_setpshared() and pthread_condattr_setpshared(). Without these calls, the parent in the following code will never be signaled.

/*
 * gcc mutex.c -o mutex -lrt
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>

#define MYMUTEX "/mymutex"
#define MYCOND  "/mycond"

/*
 * Demo: a parent blocks on a process-shared condition variable until its
 * forked child signals it.
 *
 * The mutex lives in the POSIX shared-memory object MYMUTEX; the condition
 * variable and the flag it guards live together in MYCOND.  Both primitives
 * are initialised with PTHREAD_PROCESS_SHARED so that they keep working
 * across fork().
 *
 * Returns 0 once the parent has observed the child's signal and reaped the
 * child, -1 on any set-up, wait or reap failure.  argc/argv are unused.
 */
int main(int argc, char* argv[])
{
    /* Condition variable and its predicate, kept in a single mapping. */
    struct shared_cond {
        pthread_cond_t cond;
        int signaled;   /* read and written only while holding the mutex */
    };
    struct shared_cond *shared = MAP_FAILED;
    pthread_mutex_t *mutex = MAP_FAILED;
    pthread_cond_t *cond;
    pthread_mutexattr_t mattr;
    pthread_condattr_t cattr;
    int cond_id = -1, mutex_id = -1;
    int mode = S_IRWXU | S_IRWXG;
    int status = -1;
    int rc;
    pid_t pid;

    (void) argc;
    (void) argv;

    /* mutex */
    mutex_id = shm_open(MYMUTEX, O_CREAT | O_RDWR | O_TRUNC, mode);
    if (mutex_id < 0) {
        perror("shm_open failed with " MYMUTEX);
        goto cleanup;
    }
    if (ftruncate(mutex_id, sizeof(*mutex)) == -1) {
        perror("ftruncate failed with " MYMUTEX);
        goto cleanup;
    }
    mutex = mmap(NULL, sizeof(*mutex), PROT_READ | PROT_WRITE, MAP_SHARED, mutex_id, 0);
    if (mutex == MAP_FAILED) {
        perror("mmap failed with " MYMUTEX);
        goto cleanup;
    }
    /* cond (plus its predicate flag) */
    cond_id = shm_open(MYCOND, O_CREAT | O_RDWR | O_TRUNC, mode);
    if (cond_id < 0) {
        perror("shm_open failed with " MYCOND);
        goto cleanup;
    }
    if (ftruncate(cond_id, sizeof(*shared)) == -1) {
        perror("ftruncate failed with " MYCOND);
        goto cleanup;
    }
    shared = mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, MAP_SHARED, cond_id, 0);
    if (shared == MAP_FAILED) {
        perror("mmap failed with " MYCOND);
        goto cleanup;
    }
    cond = &shared->cond;
    shared->signaled = 0;

    /* set mutex shared between processes */
    rc = pthread_mutexattr_init(&mattr);
    if (rc == 0) {
        rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
        if (rc == 0) {
            rc = pthread_mutex_init(mutex, &mattr);
        }
        pthread_mutexattr_destroy(&mattr);
    }
    if (rc != 0) {
        /* pthread functions return the error number instead of setting errno */
        errno = rc;
        perror("initialising process-shared mutex failed");
        goto cleanup;
    }
    /* set condition shared between processes */
    rc = pthread_condattr_init(&cattr);
    if (rc == 0) {
        rc = pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
        if (rc == 0) {
            rc = pthread_cond_init(cond, &cattr);
        }
        pthread_condattr_destroy(&cattr);
    }
    if (rc != 0) {
        errno = rc;
        perror("initialising process-shared condition failed");
        goto destroy_mutex;
    }
    /*************************************/
    pid = fork();
    if (pid < 0) {
        perror("fork failure");
        goto destroy_cond;
    }
    if (pid == 0) { /* child */
        sleep(5);
        if (pthread_mutex_lock(mutex) != 0) {
            exit(1);
        }
        shared->signaled = 1;   /* set the predicate before signalling */
        pthread_cond_signal(cond);
        printf("child signaled\n");
        pthread_mutex_unlock(mutex);
        exit(0);
    }

    /* parent */
    printf("parent waiting on condition\n");
    rc = pthread_mutex_lock(mutex);
    if (rc == 0) {
        /*
         * Re-check the flag after every wake-up: condition waits may return
         * spuriously, and the child may have signalled before we got here.
         */
        while (rc == 0 && !shared->signaled) {
            rc = pthread_cond_wait(cond, mutex);
        }
        if (rc == 0) {
            printf("parent signaled by child, wake up!!!\n");
        }
        pthread_mutex_unlock(mutex);
    }
    if (rc != 0) {
        errno = rc;
        perror("waiting for child signal failed");
    } else {
        status = 0;
    }
    /*
     * Reap the child before tearing anything down so that the mutex and the
     * condition variable are never destroyed while it may still touch them.
     */
    while (waitpid(pid, NULL, 0) == -1) {
        if (errno != EINTR) {
            perror("waitpid failed");
            status = -1;
            break;
        }
    }

destroy_cond:
    pthread_cond_destroy(cond);
destroy_mutex:
    pthread_mutex_destroy(mutex);
cleanup:
    if (shared != MAP_FAILED) {
        munmap(shared, sizeof(*shared));
    }
    if (mutex != MAP_FAILED) {
        munmap(mutex, sizeof(*mutex));
    }
    if (cond_id >= 0) {
        close(cond_id);
        shm_unlink(MYCOND);
    }
    if (mutex_id >= 0) {
        close(mutex_id);
        shm_unlink(MYMUTEX);
    }
    return status;
}

* gcc mutex.c -o mutex -lrt

#include <stdio.h>

#include <stdlib.h>

#include <unistd.h>

#include <pthread.h>

#include <errno.h>

#include <sys/types.h>

#include <sys/mman.h>

#include <sys/stat.h>

#include <fcntl.h>

#define MYMUTEX "/mymutex"

#define MYCOND "/mycond"

int main(int argc, char* argv[])

{

pthread_cond_t *cond;

pthread_mutex_t *mutex;

int cond_id, mutex_id;

int mode = S_IRWXU | S_IRWXG;

/* mutex */

mutex_id = shm_open(MYMUTEX, O_CREAT | O_RDWR | O_TRUNC, mode);

if (mutex_id < 0) {

perror("shm_open failed with " MYMUTEX);

return -1;

}

if (ftruncate(mutex_id, sizeof(pthread_mutex_t)) == -1) {

perror("ftruncate failed with " MYMUTEX);

return -1;

}

mutex = (pthread_mutex_t *) mmap(NULL, sizeof(pthread_mutex_t), PROT_READ | PROT_WRITE, MAP_SHARED, mutex_id, 0);

if (mutex == MAP_FAILED) {

perror("mmap failed with " MYMUTEX);

return -1;

}

/* cond */

cond_id = shm_open(MYCOND, O_CREAT | O_RDWR | O_TRUNC, mode);

if (cond_id < 0) {

perror("shm_open failed with " MYCOND);

return -1;

}

if (ftruncate(cond_id, sizeof(pthread_cond_t)) == -1) {

perror("ftruncate failed with " MYCOND);

return -1;

}

cond = (pthread_cond_t *) mmap(NULL, sizeof(pthread_cond_t), PROT_READ | PROT_WRITE, MAP_SHARED, cond_id, 0);

if (cond == MAP_FAILED) {

perror("ftruncate failed with " MYCOND);

return -1;

}

/* set mutex shared between processes */

pthread_mutexattr_t mattr;

pthread_mutexattr_init(&mattr);

pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);

pthread_mutex_init(mutex, &mattr);

pthread_mutexattr_destroy(&mattr);

/* set condition shared between processes */

pthread_condattr_t cattr;

pthread_condattr_init(&cattr);

pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);

pthread_cond_init(cond, &cattr);

pthread_condattr_destroy(&cattr);

/*************************************/

pid_t pid;

if ((pid = fork()) < 0) {

perror("fork failure");

return -1;

} else if (pid == 0) { /* child */

sleep(5);

pthread_mutex_lock(mutex);

pthread_cond_signal(cond);

printf("child signaled\n");

pthread_mutex_unlock(mutex);

exit(0);

} else { /* parent */

printf("parent waiting on condition\n");

pthread_mutex_lock(mutex);

pthread_cond_wait(cond, mutex);

printf("parent signaled by child, wake up!!!\n");

pthread_mutex_unlock(mutex);

pthread_mutex_destroy(mutex);

pthread_cond_destroy(cond);

shm_unlink(MYCOND);

shm_unlink(MYMUTEX);

}

return 0;

}

Shared memory is used to share the mutex and condition variable.

NOTE: The process-shared mutex attribute isn’t universally supported yet. You should confirm before using them.

Updated Oct 14, 2020: Fixed pthread_mutexattr_t and pthread_condattr_t initialization.

Pre/Post-main Function Call Implementation in C

Posted on February 13, 2014 by gonwan — No Comments ↓

In C++, pre/post-main function calls can be implemented using a global class instance: its constructor and destructor are invoked automatically before and after the main function. C has no such mechanism. Fortunately, there is an implementation in glib that can help. You may want to read my previous post about the CRT sections of MSVC. I just copied the code and did some renaming:

#include <stdlib.h>
/*
 * Pre/post-main hooks.
 *
 * DEFINE_CONSTRUCTOR(f) declares `static void f(void)` and arranges for it
 * to run before main(); DEFINE_DESTRUCTOR(f) declares `static void f(void)`
 * and arranges for it to run after main().  The caller provides the body of
 * f.  HAS_CONSTRUCTORS is defined whenever both macros are available; when
 * it is left undefined the including file has to register its hooks itself.
 */
#if defined (_MSC_VER)
#if (_MSC_VER >= 1500)
/* Visual Studio 2008 and later have __pragma */
#define HAS_CONSTRUCTORS
/* Places a pointer to a wrapper around _func in the .CRT$XCU section. */
#define DEFINE_CONSTRUCTOR(_func) \
    static void _func(void); \
    static int _func ## _wrapper(void) { _func(); return 0; } \
    __pragma(section(".CRT$XCU",read)) \
    __declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _wrapper;
/* Places a .CRT$XCU entry whose only job is to register _func with atexit(). */
#define DEFINE_DESTRUCTOR(_func) \
    static void _func(void); \
    static int _func ## _constructor(void) { atexit (_func); return 0; } \
    __pragma(section(".CRT$XCU",read)) \
    __declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _constructor;
#elif (_MSC_VER >= 1400)
/* Visual Studio 2005 */
#define HAS_CONSTRUCTORS
/* The section is declared once here instead of via __pragma inside each macro. */
#pragma section(".CRT$XCU",read)
#define DEFINE_CONSTRUCTOR(_func) \
    static void _func(void); \
    static int _func ## _wrapper(void) { _func(); return 0; } \
    __declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _wrapper;
#define DEFINE_DESTRUCTOR(_func) \
    static void _func(void); \
    static int _func ## _constructor(void) { atexit (_func); return 0; } \
    __declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _constructor;
#else
/*
 * Visual Studio 2003 and earlier: no macro implementation is provided here.
 * HAS_CONSTRUCTORS is deliberately left undefined (instead of stopping the
 * build with #error) so that the including file can still register its
 * pre/post-main functions with its own #pragma-based fallback.
 */
#endif
#elif (__GNUC__ > 2) || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)
#define HAS_CONSTRUCTORS
/* GCC 2.7 and later: the constructor/destructor function attributes do the work. */
#define DEFINE_CONSTRUCTOR(_func) static void __attribute__((constructor)) _func (void);
#define DEFINE_DESTRUCTOR(_func) static void __attribute__((destructor)) _func (void);
#else
/* not supported */
#endif

#include <stdlib.h>

#if defined (_MSC_VER)

#if (_MSC_VER >= 1500)

/* Visual Studio 2008 and later have __pragma */

#define HAS_CONSTRUCTORS

#define DEFINE_CONSTRUCTOR(_func) \

static void _func(void); \

static int _func ## _wrapper(void) { _func(); return 0; } \

__pragma(section(".CRT$XCU",read)) \

__declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _wrapper;

#define DEFINE_DESTRUCTOR(_func) \

static void _func(void); \

static int _func ## _constructor(void) { atexit (_func); return 0; } \

__pragma(section(".CRT$XCU",read)) \

__declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _constructor;

#elif (_MSC_VER >= 1400)

/* Visual Studio 2005 */

#define HAS_CONSTRUCTORS

#pragma section(".CRT$XCU",read)

#define DEFINE_CONSTRUCTOR(_func) \

static void _func(void); \

static int _func ## _wrapper(void) { _func(); return 0; } \

__declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _wrapper;

#define DEFINE_DESTRUCTOR(_func) \

static void _func(void); \

static int _func ## _constructor(void) { atexit (_func); return 0; } \

__declspec(allocate(".CRT$XCU")) static int (* _array ## _func)(void) = _func ## _constructor;

#else

/* Visual Studio 2003 and early versions should use #pragma code_seg() to define pre/post-main functions. */

#error Pre/Post-main function not supported on your version of Visual Studio.

#endif

#elif (__GNUC__ > 2) || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)

#define HAS_CONSTRUCTORS

#define DEFINE_CONSTRUCTOR(_func) static void __attribute__((constructor)) _func (void);

#define DEFINE_DESTRUCTOR(_func) static void __attribute__((destructor)) _func (void);

#else

/* not supported */

#endif

One limitation of the glib code is the lack of support for VS2003 and earlier versions. #pragma data_seg() can be used to implement the same functionality there:

/*
 * cl ctor.c
 * gcc ctor.c -o ctor
 */
#include "ctor.h"
#include <stdio.h>

/*
 * Register before() to run ahead of main() and after() to run once main()
 * has finished.  Preferred path: the DEFINE_* macros, available whenever
 * ctor.h defined HAS_CONSTRUCTORS.
 */
#ifdef HAS_CONSTRUCTORS
DEFINE_CONSTRUCTOR(before)
DEFINE_DESTRUCTOR(after)
#else
#ifdef _MSC_VER
/*
 * MSVC fallback used when ctor.h provided no macros: the function pointers
 * are emitted straight into the CRT tables with #pragma data_seg.
 * .CRT$XCU holds the startup entry, .CRT$XPU the entry that runs at exit.
 */
static void before(void);
static void after(void);
#pragma data_seg(".CRT$XCU")
static void (*msc_ctor)(void) = before;
#pragma data_seg(".CRT$XPU")
static void (*msc_dtor)(void) = after;
/* switch back to the default data segment */
#pragma data_seg()
#endif
#endif

/* Pre-main hook: writes a marker line so the call order is visible. */
void before()
{
    fputs("before main\n", stdout);
}

/* Post-main hook: writes a marker line so the call order is visible. */
void after()
{
    fputs("after main\n", stdout);
}

/* Prints its own marker line; the hooks registered above print the others. */
int main()
{
    fputs("in main\n", stdout);
    return 0;
}

* cl ctor.c

* gcc ctor.c -o ctor

#include "ctor.h"

#include <stdio.h>

#ifdef HAS_CONSTRUCTORS

DEFINE_CONSTRUCTOR(before)

DEFINE_DESTRUCTOR(after)

#else

#ifdef _MSC_VER

static void before(void);

static void after(void);

#pragma data_seg(".CRT$XCU")

static void (*msc_ctor)(void) = before;

#pragma data_seg(".CRT$XPU")

static void (*msc_dtor)(void) = after;

#pragma data_seg()

#endif

void before()

{

printf("before main\n");

}

void after()

{

printf("after main\n");

}

int main()

{

printf("in main\n");

return 0;

}

Output from msvc/gcc:

before main
in main
after main

before main

in main

after main

MSVC CRT Initialization

Posted on February 13, 2014 by gonwan — No Comments ↓

This post provides a detailed view of the MSDN article CRT Initialization. Just paste some content here:

The CRT obtains the list of function pointers from the Visual C++ compiler. When the compiler sees a global initializer, it generates a dynamic initializer in the .CRT$XCU section (where CRT is the section name and XCU is the group name). To obtain a list of those dynamic initializers run the command dumpbin /all main.obj, and then search the .CRT$XCU section (when main.cpp is compiled as a C++ file, not a C file).

The CRT defines two pointers:
– __xc_a in .CRT$XCA
– __xc_z in .CRT$XCZ

Both groups do not have any other symbols defined except __xc_a and __xc_z. Now, when the linker reads various .CRT groups, it combines them in one section and orders them alphabetically. This means that the user-defined global initializers (which the Visual C++ compiler puts in .CRT$XCU) will always come after .CRT$XCA and before .CRT$XCZ.

So, the CRT library uses both __xc_a and __xc_z to determine the start and end of the global initializers list because of the way in which they are laid out in memory after the image is loaded.

Let’s run our VS debugger to further investigate the CRT implementation. I’m using VS2010, and a global instance of class A is declared and initialized:

// Type whose constructor and destructor only announce themselves on stdout.
class A
{
public:
    A();    // prints "in A::A()"
    ~A();   // prints "in A::~A()"
};

A::A()
{
    std::cout << "in A::A()";
    std::cout << std::endl;
}

A::~A()
{
    std::cout << "in A::~A()";
    std::cout << std::endl;
}

// Global instance: constructed before main() and destroyed after it returns.
A a;

class A

{

public:

A();

~A();

};

A::A()

{

std::cout << "in A::A()" << std::endl;

}

A::~A()

{

std::cout << "in A::~A()" << std::endl;

}

A a;

Now set the breakpoints in the constructor and destructor, and start debugging. I’ve tried exe/dll and dynamic/static CRT combinations to view the call stacks:

1) exe with crt dynamic linked:
  crtexe.c: (w)mainCRTStartup()
    +--> crtexe.c: __tmainCRTStartup()
           +--> crt0dat.c: _initterm()
2) exe with crt static linked:
  crt0.c: _tmainCRTStartup()
    +--> crt0.c: __tmainCRTStartup()
           +--> crt0dat.c: _cinit()
                  +--> crt0dat.c: _initterm()
3) dll with crt dynamic linked:
  crtdll.c: _DllMainCRTStartup()
    +--> crtdll.c: __DllMainCRTStartup()
           +--> crtdll.c: _CRT_INIT()
                  +--> crt0dat.c: _initterm()
4) dll with crt static linked:
  dllcrt0.c: _DllMainCRTStartup()
    +--> dllcrt0.c: __DllMainCRTStartup()
           +--> dllcrt0.c: _CRT_INIT()
                  +--> crt0dat.c: _cinit()
                         +--> crt0dat.c: _initterm()

1) exe with crt dynamic linked:

crtexe.c: (w)mainCRTStartup()

+--> crtexe.c: __tmainCRTStartup()

+--> crt0dat.c: _initterm()

2) exe with crt static linked:

crt0.c: _tmainCRTStartup()

+--> crt0.c: __tmainCRTStartup()

+--> crt0dat.c: _cinit()

+--> crt0dat.c: _initterm()

3) dll with crt dynamic linked:

crtdll.c: _DllMainCRTStartup()

+--> crtdll.c: __DllMainCRTStartup()

+--> crtdll.c: _CRT_INIT()

+--> crt0dat.c: _initterm()

4) dll with crt static linked:

dllcrt0.c: _DllMainCRTStartup()

+--> dllcrt0.c: __DllMainCRTStartup()

+--> dllcrt0.c: _CRT_INIT()

+--> crt0dat.c: _cinit()

+--> crt0dat.c: _initterm()

_initterm is defined as follows. It is used to walk the table of function pointers between __xc_a and __xc_z mentioned above:

// crt0dat.c
void __cdecl _initterm (
        _PVFV * pfbegin,
        _PVFV * pfend
        )
{
        _PVFV * slot;

        /*
         * Visit every slot in [pfbegin, pfend) in ascending address order.
         * pfbegin itself is a valid slot and is not skipped; pfend is one
         * past the last valid slot and is never dereferenced.
         */
        for ( slot = pfbegin; slot < pfend; ++slot )
        {
            /*
             * empty (NULL) slots are stepped over, all others are called
             */
            if ( *slot == NULL )
                continue;
            (**slot)();
        }
}

// crt0dat.c

void __cdecl _initterm (

_PVFV * pfbegin,

_PVFV * pfend

)

{

* walk the table of function pointers from the bottom up, until

* the end is encountered. Do not skip the first entry. The initial

* value of pfbegin points to the first valid entry. Do not try to

* execute what pfend points to. Only entries before pfend are valid.

while ( pfbegin < pfend )

{

* if current table entry is non-NULL, call thru it.

if ( *pfbegin != NULL )

(**pfbegin)();

++pfbegin;

}

__xc_a, __xc_z and other section groups are defined as:

// crt0dat.c
/*
 * pointers to initialization sections
 *
 * Each table is bracketed by an "...A" symbol and a "...Z" symbol placed in
 * the first and last group of its .CRT section range; the trailing comment
 * on each pair names the kind of entries that sit between them.
 */
extern _CRTALLOC(".CRT$XIA") _PIFV __xi_a[];
extern _CRTALLOC(".CRT$XIZ") _PIFV __xi_z[];    /* C initializers */
extern _CRTALLOC(".CRT$XCA") _PVFV __xc_a[];
extern _CRTALLOC(".CRT$XCZ") _PVFV __xc_z[];    /* C++ initializers */
extern _CRTALLOC(".CRT$XPA") _PVFV __xp_a[];
extern _CRTALLOC(".CRT$XPZ") _PVFV __xp_z[];    /* C pre-terminators */
extern _CRTALLOC(".CRT$XTA") _PVFV __xt_a[];
extern _CRTALLOC(".CRT$XTZ") _PVFV __xt_z[];    /* C terminators */
// sect_attribs.h
/* Shorthand for allocating a variable in the named section (taken from a different header than the declarations above). */
#define _CRTALLOC(x) __declspec(allocate(x))

// crt0dat.c

* pointers to initialization sections

extern _CRTALLOC(".CRT$XIA") _PIFV __xi_a[];

extern _CRTALLOC(".CRT$XIZ") _PIFV __xi_z[]; /* C initializers */

extern _CRTALLOC(".CRT$XCA") _PVFV __xc_a[];

extern _CRTALLOC(".CRT$XCZ") _PVFV __xc_z[]; /* C++ initializers */

extern _CRTALLOC(".CRT$XPA") _PVFV __xp_a[];

extern _CRTALLOC(".CRT$XPZ") _PVFV __xp_z[]; /* C pre-terminators */

extern _CRTALLOC(".CRT$XTA") _PVFV __xt_a[];

extern _CRTALLOC(".CRT$XTZ") _PVFV __xt_z[]; /* C terminators */

// sect_attribs.h

#define _CRTALLOC(x) __declspec(allocate(x))

gcc uses a similar technique to deal with pre/post-main code. The section names are .init and .fini.

Exception Safety with shared_ptr

Posted on February 10, 2014 by gonwan — No Comments ↓

Code snippet:

#include <iostream>
#include <boost/shared_ptr.hpp>

// Traces its own construction and destruction on stdout.
class A {
public:
    A()
    {
        std::cout << "in A::A()." << std::endl;
    }

    ~A()
    {
        std::cout << "in A::~A()." << std::endl;
    }
};

// Traces its construction, then always fails it by throwing the int 1024.
class B {
public:
    B()
    {
        std::cout << "in B::B()." << std::endl;
        throw 1024;
    }

    ~B()
    {
        std::cout << "in B::~B()." << std::endl;
    }
};

// Owns one A and one B.  Building with -D_USE_SHARED_PTR switches the two
// members from raw pointers (released by hand in ~C) to boost::shared_ptr.
// m_a is declared, and therefore initialised, before m_b.
class C {
public:
    C()
        : m_a(new A)
        , m_b(new B)
    {
    }
#if !defined(_USE_SHARED_PTR)
    // Raw-pointer build only: release the members in reverse order.
    ~C()
    {
        delete m_b;
        delete m_a;
    }
#endif
private:
#if defined(_USE_SHARED_PTR)
    boost::shared_ptr<A> m_a;
    boost::shared_ptr<B> m_b;
#else
    A *m_a;
    B *m_b;
#endif
};

// Attempts to build a C and discards whatever its constructor throws.
int main() {
    try {
        C c;
    } catch (...) {
        // intentionally ignored: only the trace output matters
    }
    return 0;
}

#include <iostream>

#include <boost/shared_ptr.hpp>

class A {

public:

A() { std::cout << "in A::A()." << std::endl; }

~A() { std::cout << "in A::~A()." << std::endl; }

};

class B {

public:

B() { std::cout << "in B::B()." << std::endl; throw 1024; }

~B() { std::cout << "in B::~B()." << std::endl; }

};

class C {

public:

C() : m_a(new A), m_b(new B) { }

#ifndef _USE_SHARED_PTR

~C() { delete m_b; delete m_a; }

#endif

private:

#ifndef _USE_SHARED_PTR

A *m_a;

B *m_b;

#else

boost::shared_ptr<A> m_a;

boost::shared_ptr<B> m_b;

#endif

};

int main() {

try { C c; } catch (...) { }

return 0;

}

Output:

binson@binson-precise:~$ g++ ptr.cpp -o ptr
binson@binson-precise:~$ ./ptr
in A::A().
in B::B().
binson@binson-precise:~$ g++ -D_USE_SHARED_PTR ptr.cpp -o ptr
binson@binson-precise:~$ ./ptr
in A::A().
in B::B().
in A::~A().

binson@binson-precise:~$ g++ ptr.cpp -o ptr

binson@binson-precise:~$ ./ptr

in A::A().

in B::B().

binson@binson-precise:~$ g++ -D_USE_SHARED_PTR ptr.cpp -o ptr

binson@binson-precise:~$ ./ptr

in A::A().

in B::B().

in A::~A().

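Exception safety is ensured when using shared_ptr: the memory allocated for m_a is freed even though an exception is thrown. The trick is that when the constructor of class C throws, the destructor of C itself is never invoked, but the destructors of members that were already fully constructed (here the shared_ptr m_a) still run during stack unwinding. A raw pointer has no destructor, so in the first build the A object is leaked.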

Compiler Intrinsic Functions

Posted on October 30, 2013 by gonwan — No Comments ↓

Copied from Wikipedia:

An intrinsic function is a function available for use in a given programming language whose implementation is handled specially by the compiler. Typically, it substitutes a sequence of automatically generated instructions for the original function call, similar to an inline function. Unlike an inline function though, the compiler has an intimate knowledge of the intrinsic function and can therefore better integrate it and optimize it for the situation. This is also called builtin function in many languages.

A code snippet is written to check the code generation when intrinsic is enabled or not:

/*
 * # gcc -S intrinsic.c -o intrinsic.s
 * # gcc -S -fno-builtin intrinsic.c -o intrinsic2.s
 * # cl /c /Oi intrinsic.c /FAs /Faintrinsic.asm
 * # cl /c intrinsic.c /FAs /Faintrinsic2.asm
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

const char *c = "Hello World!";
char c2[16];

/*
 * Exercises abs() and memcpy() so that the generated assembly shows whether
 * the compiler expanded them inline or emitted real calls.  argv is unused.
 */
int main(int argc, char *argv[])
{
    /* argc is a run-time value, so abs() cannot be folded away at compile time */
    int a = abs(argc);
    /*
     * Copies the 12 characters of "Hello World!" without its terminator;
     * c2 is a file-scope array and therefore starts zero-filled, which is
     * what terminates the string for printf below.
     */
    memcpy(c2, c, 12);
    printf("%d,%s\n", a, c2);
    return 0;
}

* # gcc -S intrinsic.c -o intrinsic.s

* # gcc -S -fno-builtin intrinsic.c -o intrinsic2.s

* # cl /c /Oi intrinsic.c /FAs /Faintrinsic.asm

* # cl /c intrinsic.c /FAs /Faintrinsic2.asm

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

const char *c = "Hello World!";

char c2[16];

int main(int argc, char *argv[])

{

int a = abs(argc);

memcpy(c2, c, 12);

printf("%d,%s\n", a, c2);

return 0;

}

Generated assembly:

main:
    pushl   %ebp
    movl    %esp, %ebp
    andl    $-16, %esp
    subl    $32, %esp
    movl    8(%ebp), %eax
    sarl    $31, %eax
    movl    %eax, %edx
    xorl    8(%ebp), %edx
    movl    %edx, 28(%esp)
    subl    %eax, 28(%esp)
    movl    c, %eax
    movl    %eax, %edx
    movl    $c2, %eax
    movl    (%edx), %ecx
    movl    %ecx, (%eax)
    movl    4(%edx), %ecx
    movl    %ecx, 4(%eax)
    movl    8(%edx), %edx
    movl    %edx, 8(%eax)
    movl    $.LC1, %eax
    movl    $c2, 8(%esp)
    movl    28(%esp), %edx
    movl    %edx, 4(%esp)
    movl    %eax, (%esp)
    call    printf
    movl    $0, %eax
    leave
    ret

main:

pushl %ebp

movl %esp, %ebp

andl $-16, %esp

subl $32, %esp

movl 8(%ebp), %eax

sarl $31, %eax

movl %eax, %edx

xorl 8(%ebp), %edx

movl %edx, 28(%esp)

subl %eax, 28(%esp)

movl c, %eax

movl %eax, %edx

movl $c2, %eax

movl (%edx), %ecx

movl %ecx, (%eax)

movl 4(%edx), %ecx

movl %ecx, 4(%eax)

movl 8(%edx), %edx

movl %edx, 8(%eax)

movl $.LC1, %eax

movl $c2, 8(%esp)

movl 28(%esp), %edx

movl %edx, 4(%esp)

movl %eax, (%esp)

call printf

movl $0, %eax

leave

ret

Only printf() is called in the generated code; there are no calls to abs() or memcpy(), because both are intrinsic (built-in) functions, as listed here in gcc’s online documentation.

Intrinsic can be explicitly disabled. For instance, CRT intrinsic must be disabled for kernel development. Add -fno-builtin flag to gcc, or remove /Oi switch in MSVC. Only paste the generated code in gcc case here:

main:
    pushl   %ebp
    movl    %esp, %ebp
    andl    $-16, %esp
    subl    $32, %esp
    movl    8(%ebp), %eax
    movl    %eax, (%esp)
    call    abs
    movl    %eax, 28(%esp)
    movl    c, %eax
    movl    %eax, %edx
    movl    $c2, %eax
    movl    $12, 8(%esp)
    movl    %edx, 4(%esp)
    movl    %eax, (%esp)
    call    memcpy
    movl    $.LC1, %eax
    movl    $c2, 8(%esp)
    movl    28(%esp), %edx
    movl    %edx, 4(%esp)
    movl    %eax, (%esp)
    call    printf
    movl    $0, %eax
    leave
    ret

main:

pushl %ebp

movl %esp, %ebp

andl $-16, %esp

subl $32, %esp

movl 8(%ebp), %eax

movl %eax, (%esp)

call abs

movl %eax, 28(%esp)

movl c, %eax

movl %eax, %edx

movl $c2, %eax

movl $12, 8(%esp)

movl %edx, 4(%esp)

movl %eax, (%esp)

call memcpy

movl $.LC1, %eax

movl $c2, 8(%esp)

movl 28(%esp), %edx

movl %edx, 4(%esp)

movl %eax, (%esp)

call printf

movl $0, %eax

leave

ret

There _are_ abs() and memcpy() now. General MSVC intrinsic can be found here.

Intrinsics are easier to use than inline assembly, and in most cases they are used to increase performance. Both gcc and MSVC provide intrinsic support for Intel’s MMX, SSE and SSE2 instruction sets. Code snippet to use MMX:

/*
 * # gcc -O2 -S -mmmx intrinsic_mmx.c -o intrinsic_mmx.s
 * # cl /O2 /c intrinsic_mmx.c /FAs /Faintrinsic_mmx.asm
 */
#include <stdio.h>
#include <mmintrin.h>

/*
 * Adds two pairs of ints with MMX intrinsics and prints both sums.
 */
int main()
{
    __m64 m1, m2, m3;
    int out1, out2;
    int in1[] = { 222, 111 };
    int in2[] = { 444, 333 };
#if 0
    /* disabled alternative: build the vectors element by element */
    m1 = _mm_setr_pi32(in1[0], in1[1]);
    m2 = _mm_setr_pi32(in2[0], in2[1]);
#else
    /*
     * Reinterpret each two-int array as one 64-bit MMX value.
     * NOTE(review): assumes an int[2] may be read through an __m64 pointer
     * (alignment/aliasing) on the target compiler; confirm before reuse.
     */
    m1 = *(__m64 *)in1;
    m2 = *(__m64 *)in2;
#endif
    m3 = _mm_add_pi32(m1, m2); /* one packed add covers both pairs (paddd in the listing) */
    /* the low 32 bits of the result go to out1 */
    out1 = _mm_cvtsi64_si32(m3);
    /* shift the upper half down so that it can be extracted the same way */
    m3  = _mm_srli_si64(m3, 32);
    out2 = _mm_cvtsi64_si32(m3);
    /* leave MMX state before further library calls (emms in the listing) */
    _mm_empty();
    printf("out1=%d,out2=%d\n", out1, out2);
    return 0;
}

* # gcc -O2 -S -mmmx intrinsic_mmx.c -o intrinsic_mmx.s

* # cl /O2 /c intrinsic_mmx.c /FAs /Faintrinsic_mmx.asm

#include <stdio.h>

#include <mmintrin.h>

int main()

{

__m64 m1, m2, m3;

int out1, out2;

int in1[] = { 222, 111 };

int in2[] = { 444, 333 };

#if 0

m1 = _mm_setr_pi32(in1[0], in1[1]);

m2 = _mm_setr_pi32(in2[0], in2[1]);

#else

m1 = *(__m64 *)in1;

m2 = *(__m64 *)in2;

#endif

m3 = _mm_add_pi32(m1, m2);

out1 = _mm_cvtsi64_si32(m3);

m3 = _mm_srli_si64(m3, 32);

out2 = _mm_cvtsi64_si32(m3);

_mm_empty();

printf("out1=%d,out2=%d\n", out1, out2);

return 0;

}

Assembly looks like:

main:
    pushl   %ebp
    movl    %esp, %ebp
    andl    $-16, %esp
    subl    $16, %esp
    movq    .LC1, %mm0
    paddd   .LC2, %mm0
    movd    %mm0, 8(%esp)
    psrlq   $32, %mm0
    movd    %mm0, 12(%esp)
    emms
    movl    $.LC0, 4(%esp)
    movl    $1, (%esp)
    call    __printf_chk
    xorl    %eax, %eax
    leave
    ret

main:

pushl %ebp

movl %esp, %ebp

andl $-16, %esp

subl $16, %esp

movq .LC1, %mm0

paddd .LC2, %mm0

movd %mm0, 8(%esp)

psrlq $32, %mm0

movd %mm0, 12(%esp)

emms

movl $.LC0, 4(%esp)

movl $1, (%esp)

call __printf_chk

xorl %eax, %eax

leave

ret

You can see MMX registers and instructions this time. The -mmmx flag is required when building with gcc. MSVC generates similar code. A reference for these instruction sets is available on Intel’s website.

A simple benchmark that uses SSE is available here.

Jump Instructions and EFLAGS

Posted on October 29, 2013 by gonwan — No Comments ↓

I used to have a misconception about conditional jumps: I thought they only check the result of the CMP and TEST instructions. So when one appeared after other instructions like ADD or SUB, I had no clue how it worked.

Actually, a conditional jump checks flags in the EFLAGS control register. From Intel’s manual, vol 1, 3.4.3:

The status flags (bits 0, 2, 4, 6, 7, and 11) of the EFLAGS register indicate the results of arithmetic instructions, such as the ADD, SUB, MUL, and DIV instructions. The status flag functions are:

CF (bit 0) Carry flag: Set if an arithmetic operation generates a carry or a borrow out of the most-significant bit of the result; cleared otherwise. This flag indicates an overflow condition for unsigned-integer arithmetic. It is also used in multiple-precision arithmetic.

PF (bit 2) Parity flag: Set if the least-significant byte of the result contains an even number of 1 bits; cleared otherwise.
AF (bit 4) Adjust flag: Set if an arithmetic operation generates a carry or a borrow out of bit 3 of the result; cleared otherwise. This flag is used in binary-coded decimal (BCD) arithmetic.

ZF (bit 6) Zero flag: Set if the result is zero; cleared otherwise.

SF (bit 7) Sign flag: Set equal to the most-significant bit of the result, which is the sign bit of a signed integer. (0 indicates a positive value and 1 indicates a negative value.)

OF (bit 11) Overflow flag: Set if the integer result is too large a positive number or too small a negative number (excluding the sign-bit) to fit in the destination operand; cleared otherwise. This flag indicates an overflow condition for signed-integer (two’s complement) arithmetic.

And again from vol 2a, section Jcc Jump if Condition is met, more details. I just copy content from here:

Instruction	Description	signed?	Flags	short jump opcodes	near jump opcodes
JO	Jump if overflow		OF = 1	70	0F 80
JNO	Jump if not overflow		OF = 0	71	0F 81
JS	Jump if sign		SF = 1	78	0F 88
JNS	Jump if not sign		SF = 0	79	0F 89
JE JZ	Jump if equal Jump if zero		ZF = 1	74	0F 84
JNE JNZ	Jump if not equal Jump if not zero		ZF = 0	75	0F 85
JB JNAE JC	Jump if below Jump if not above or equal Jump if carry	unsigned	CF = 1	72	0F 82
JNB JAE JNC	Jump if not below Jump if above or equal Jump if not carry	unsigned	CF = 0	73	0F 83
JBE JNA	Jump if below or equal Jump if not above	unsigned	CF = 1 or ZF = 1	76	0F 86
JA JNBE	Jump if above Jump if not below or equal	unsigned	CF = 0 and ZF = 0	77	0F 87
JL JNGE	Jump if less Jump if not greater or equal	signed	SF <> OF	7C	0F 8C
JGE JNL	Jump if greater or equal Jump if not less	signed	SF = OF	7D	0F 8D
JLE JNG	Jump if less or equal Jump if not greater	signed	ZF = 1 or SF <> OF	7E	0F 8E
JG JNLE	Jump if greater Jump if not less or equal	signed	ZF = 0 and SF = OF	7F	0F 8F
JP JPE	Jump if parity Jump if parity even		PF = 1	7A	0F 8A
JNP JPO	Jump if not parity Jump if parity odd		PF = 0	7B	0F 8B
JCXZ JECXZ	Jump if %CX register is 0 Jump if %ECX register is 0		%CX = 0 %ECX = 0	E3	E3

There are signed and unsigned versions of each comparison: JA vs. JG, JB vs. JL, etc. Let’s take JA and JG to explain the difference. For JA, it’s clear that it requires CF=0 (no borrow) and ZF=0 (not equal). For JG, it requires ZF=0 and SF=OF. When the two operands have the same sign, the subtraction cannot overflow, so OF=0 and the result must be positive (SF=0). When the two operands have different signs, the first operand must be the positive one: either the subtraction does not overflow and the result is positive (SF=OF=0), or it overflows and the wrapped result looks negative (SF=OF=1).

Note, the following 2 lines(AT&T syntax) are equivalent. CPU does arithmetic calculation, it does not care about whether it is signed or unsigned. It only set flags. It is we that make the signed or unsigned jump decision.

movl $-1, %eax
movl $0xffffffff, %eax

movl $-1, %eax
movl $0xffffffff, %eax

Last, I’d like to use ndisasm (install the nasm package to get it) to illustrate how jump instructions are encoded, including short jumps, near jumps and far jumps:

# echo -e "\x74\x00" | ndisasm -
00000000  7400              jz 0x2
00000002  0A                db 0x0a
# echo -e "\x74\xfe" | ndisasm -
00000000  74FE              jz 0x0
00000002  0A                db 0x0a
# echo -e "\x0f\x84\x00\x00" | ndisasm -
00000000  0F840000          jz word 0x4
00000004  0A                db 0x0a
# echo -e "\x0f\x84\xfc\xff" | ndisasm -
00000000  0F84FCFF          jz word 0x0
00000004  0A                db 0x0a
# echo -e "\x0f\x84\x00\x00\x00\x00" | ndisasm - -b 32
00000000  0F8400000000      jz dword 0x6
00000006  0A                db 0x0a
# echo -e "\x0f\x84\xfa\xff\xff\xff" | ndisasm - -b 32
00000000  0F84FAFFFFFF      jz dword 0x0
00000006  0A                db 0x0a
# echo -e "\xeb\x00" | ndisasm -
00000000  EB00              jmp short 0x2
00000002  0A                db 0x0a
# echo -e "\xe9\x00\x00" | ndisasm -
00000000  E90000            jmp word 0x3
00000003  0A                db 0x0a
# echo -e "\xe9\x00\x00\x00\x00" | ndisasm - -b32
00000000  E900000000        jmp dword 0x5
00000005  0A                db 0x0a
# echo -e "\xea\x00\x00\x34\x12" | ndisasm -
00000000  EA00003412        jmp word 0x1234:0x0
00000005  0A                db 0x0a
# echo -e "\xea\x00\x00\x00\x00\x34\x12" | ndisasm - -b 32
00000000  EA000000003412    jmp dword 0x1234:0x0
00000007  0A                db 0x0a

# echo -e "\x74\x00" | ndisasm -

00000000 7400 jz 0x2

00000002 0A db 0x0a

# echo -e "\x74\xfe" | ndisasm -

00000000 74FE jz 0x0

00000002 0A db 0x0a

# echo -e "\x0f\x84\x00\x00" | ndisasm -

00000000 0F840000 jz word 0x4

00000004 0A db 0x0a

# echo -e "\x0f\x84\xfc\xff" | ndisasm -

00000000 0F84FCFF jz word 0x0

00000004 0A db 0x0a

# echo -e "\x0f\x84\x00\x00\x00\x00" | ndisasm - -b 32

00000000 0F8400000000 jz dword 0x6

00000006 0A db 0x0a

# echo -e "\x0f\x84\xfa\xff\xff\xff" | ndisasm - -b 32

00000000 0F84FAFFFFFF jz dword 0x0

00000006 0A db 0x0a

# echo -e "\xeb\x00" | ndisasm -

00000000 EB00 jmp short 0x2

00000002 0A db 0x0a

# echo -e "\xe9\x00\x00" | ndisasm -

00000000 E90000 jmp word 0x3

00000003 0A db 0x0a

# echo -e "\xe9\x00\x00\x00\x00" | ndisasm - -b32

00000000 E900000000 jmp dword 0x5

00000005 0A db 0x0a

# echo -e "\xea\x00\x00\x34\x12" | ndisasm -

00000000 EA00003412 jmp word 0x1234:0x0

00000005 0A db 0x0a

# echo -e "\xea\x00\x00\x00\x00\x34\x12" | ndisasm - -b 32

00000000 EA000000003412 jmp dword 0x1234:0x0

00000007 0A db 0x0a

MSVC Inline Assembly

Posted on October 23, 2013 by gonwan — No Comments ↓

MSVC’s inline assembly is easier to use than gcc’s version. I think it makes it easier to write correct code than incorrect code. A simple add function is again used to illustrate:

/* Plain C reference implementation: returns the sum of a and b. */
int add1(int a, int b)
{
    int sum = a;

    sum += b;
    return sum;
}

int add1(int a, int b)

{

return a + b;

}

The corresponding inline version:

/*
 * Adds a and b using an MSVC __asm block.
 * The C parameter names are used directly as operands. The sum is left in
 * eax and there is no C return statement, so the function relies on eax
 * carrying the return value.
 */
int add2(int a, int b)
{
    __asm {
        mov eax, a;
        add eax, b;
    }
}

int add2(int a, int b)

{

__asm {

mov eax, a;

add eax, b;

}

The __asm keyword is used to specify an inline assembly block. According to MSDN, there is another keyword, asm, which is not recommended:

Visual C++ support for the Standard C++ asm keyword is limited to the fact that the compiler will not generate an error on the keyword. However, an asm block will not generate any meaningful code. Use __asm instead of asm.

Symbols in C/C++ code can be used directly in inline assembly. This is much more convenient than gcc. And it is also not necessary to load parameters into registers before usage as in gcc. MSVC does the job right even in optimization case.

NOTE: Inline assembly is not supported on the Itanium and x64 processors.

Let’s see the generated code:

# cl /c /FA testasm_windows.c

1	# cl /c /FA testasm_windows.c

Output:

PUBLIC _add2
_TEXT SEGMENT
_a$ = 8
_b$ = 12
_add2 PROC
 push ebp
 mov ebp, esp
 mov eax, DWORD PTR _a$[ebp]
 add eax, DWORD PTR _b$[ebp]
 pop ebp
 ret 0
_add2 ENDP
_TEXT ENDS

PUBLIC _add2

_TEXT SEGMENT

_a$ = 8

_b$ = 12

_add2 PROC

push ebp

mov ebp, esp

mov eax, DWORD PTR _a$[ebp]

add eax, DWORD PTR _b$[ebp]

pop ebp

ret 0

_add2 ENDP

_TEXT ENDS

Function parameters are located at [ebp+8] and [ebp+12], as referred to by symbols a and b. Then, what happens if registers other than scratch registers are specified?

/*
 * Adds a and b in ebx and then copies the sum into eax.
 * ebx is not a scratch register; the MSVC listing for this function shows
 * the compiler pushing ebx before the block and popping it afterwards.
 * There is no C return statement: the result is whatever the block leaves
 * in eax.
 */
int add3(int a, int b)
{
    __asm {
        mov ebx, a;
        add ebx, b;
        mov eax, ebx;
    }
}

int add3(int a, int b)

{

__asm {

mov ebx, a;

add ebx, b;

mov eax, ebx;

}

Output assembly code:

PUBLIC _add3
_TEXT SEGMENT
_a$ = 8
_b$ = 12
_add3 PROC
 push ebp
 mov ebp, esp
 push ebx
 mov ebx, DWORD PTR _a$[ebp]
 add ebx, DWORD PTR _b$[ebp]
 mov eax, ebx
 pop ebx
 pop ebp
 ret 0
_add3 ENDP
_TEXT ENDS

PUBLIC _add3

_TEXT SEGMENT

_a$ = 8

_b$ = 12

_add3 PROC

push ebp

mov ebp, esp

push ebx

mov ebx, DWORD PTR _a$[ebp]

add ebx, DWORD PTR _b$[ebp]

mov eax, ebx

pop ebx

pop ebp

ret 0

_add3 ENDP

_TEXT ENDS

As you see, MSVC automatically preserves ebx for us. From MSDN:

When using __asm to write assembly language in C/C++ functions, you don’t need to preserve the EAX, EBX, ECX, EDX, ESI, or EDI registers.

Let’s see the case when stdcall calling convention is used:

/*
 * __stdcall variant of the inline-assembly add.
 * With this calling convention the callee cleans up the stack (the generated
 * code ends in "ret 8") and the symbol is decorated as _add4@8.
 * The sum is left in eax; there is no C return statement.
 */
int __stdcall add4(int a, int b)
{
    __asm {
        mov eax, a;
        add eax, b;
    }
}

int __stdcall add4(int a, int b)

{

__asm {

mov eax, a;

add eax, b;

}

Output:

PUBLIC _add4@8
_TEXT SEGMENT
_a$ = 8
_b$ = 12
_add4@8 PROC
 push ebp
 mov ebp, esp
 mov eax, DWORD PTR _a$[ebp]
 add eax, DWORD PTR _b$[ebp]
 pop ebp
 ret 8
_add4@8 ENDP
_TEXT ENDS

PUBLIC _add4@8

_TEXT SEGMENT

_a$ = 8

_b$ = 12

_add4@8 PROC

push ebp

mov ebp, esp

mov eax, DWORD PTR _a$[ebp]

add eax, DWORD PTR _b$[ebp]

pop ebp

ret 8

_add4@8 ENDP

_TEXT ENDS

In stdcall, stack is cleaned up by callee. So, there’s a ret 8 before return. And the function name is mangled to _add4@8.

MSVC also supports fastcall calling convention, but it causes register conflicts as mentioned on MSDN, and is not recommended. Just test it here, the code happens to work:)

/*
 * __fastcall variant of the inline-assembly add.
 * a arrives in ecx and b in edx; the generated code first stores both to the
 * stack and the asm block then reads them back by name. MSDN warns that
 * __fastcall can cause register conflicts with inline assembly.
 * The sum is left in eax; there is no C return statement.
 */
int __fastcall add5(int a, int b)
{
    __asm {
        mov eax, a;
        add eax, b;
    }
}

int __fastcall add5(int a, int b)

{

__asm {

mov eax, a;

add eax, b;

}

Output:

PUBLIC @add5@8
_TEXT SEGMENT
_b$ = -8
_a$ = -4
@add5@8 PROC
 push ebp
 mov ebp, esp
 sub esp, 8
 mov DWORD PTR _b$[ebp], edx
 mov DWORD PTR _a$[ebp], ecx
 mov eax, DWORD PTR _a$[ebp]
 add eax, DWORD PTR _b$[ebp]
 mov esp, ebp
 pop ebp
 ret 0
@add5@8 ENDP
_TEXT ENDS

PUBLIC @add5@8

_TEXT SEGMENT

_b$ = -8

_a$ = -4

@add5@8 PROC

push ebp

mov ebp, esp

sub esp, 8

mov DWORD PTR _b$[ebp], edx

mov DWORD PTR _a$[ebp], ecx

mov eax, DWORD PTR _a$[ebp]

add eax, DWORD PTR _b$[ebp]

mov esp, ebp

pop ebp

ret 0

@add5@8 ENDP

_TEXT ENDS

Function parameters are passed in ecx and edx when using fastcall. But they are saved to stack first. It seems we get no benefit using this calling convention. Maybe MSVC does not implement it well. The function name is mangled to @add5@8.

Last, we can tell MSVC that we want to write our own prolog/epilog code sequences using __declspec(naked) directive:

/*
 * Naked __cdecl add: __declspec(naked) makes the compiler emit no
 * prolog/epilog, so the asm block sets up and tears down the frame itself
 * (push ebp / mov ebp, esp ... pop ebp / ret).
 * a and b are still referenced by name; the sum is returned in eax.
 */
__declspec(naked) int __cdecl add6(int a, int b)
{
    __asm {
        push ebp;
        mov ebp, esp;
        mov eax, a;
        add eax, b;
        pop ebp;
        ret;
    }
}

__declspec(naked) int __cdecl add6(int a, int b)

{

__asm {

push ebp;

mov ebp, esp;

mov eax, a;

add eax, b;

pop ebp;

ret;

}

Output:

PUBLIC _add6
_TEXT SEGMENT
_a$ = 8
_b$ = 12
_add6 PROC
 push ebp
 mov ebp, esp
 mov eax, DWORD PTR _a$[ebp]
 add eax, DWORD PTR _b$[ebp]
 pop ebp
 ret 0
_add6 ENDP
_TEXT ENDS

PUBLIC _add6

_TEXT SEGMENT

_a$ = 8

_b$ = 12

_add6 PROC

push ebp

mov ebp, esp

mov eax, DWORD PTR _a$[ebp]

add eax, DWORD PTR _b$[ebp]

pop ebp

ret 0

_add6 ENDP

_TEXT ENDS

A normal prolog/epilog is written by hand here. MSVC does not generate duplicate prolog/epilog code when the __declspec(naked) directive is used.

GCC Inline Assembly

Posted on October 22, 2013 by gonwan — No Comments ↓

Inline assembly is used in Linux kernel to optimize performance or access hardware. So I decided to check it first. Before digging deeper, you may wanna read the GCC Inline Assembly HOWTO to get a general understanding. In C, a simple add function looks like:

/* Plain C baseline for the inline-assembly variants: returns a plus b. */
int add1(int a, int b)
{
    int total = a + b;
    return total;
}

int add1(int a, int b)

{

return a + b;

}

Its inline assembly version may be:

/*
 * Adds the two arguments with basic (operand-less) gcc inline assembly.
 * The arguments are read straight from the frame at 8(%ebp) and 12(%ebp),
 * and the sum is left in %eax with no C return statement.
 * NOTE: this only works at the default optimization level. With -O2 the
 * value in %eax is ignored and the arguments are not guaranteed to be at
 * those frame offsets.
 */
int add2(int a, int b)
{
    __asm__ __volatile__ ("movl 12(%ebp), %eax\n\t"
                          "movl 8(%ebp), %edx\n\t"
                          "addl %edx, %eax"
    );
}

int add2(int a, int b)

{

__asm__ __volatile__ ("movl 12(%ebp), %eax\n\t"

"movl 8(%ebp), %edx\n\t"

"addl %edx, %eax"

);

}

Or simpler:

/*
 * Shorter form of the operand-less inline-assembly add: loads 12(%ebp)
 * into %eax and adds 8(%ebp) to it directly from memory.
 * NOTE: like the three-instruction version it depends on the frame layout
 * and on %eax being taken as the return value, so it breaks under -O2.
 */
int add3(int a, int b)
{
    __asm__ __volatile__ ("movl 12(%ebp), %eax\n\t"
                          "addl 8(%ebp), %eax"
    );
}

int add3(int a, int b)

{

__asm__ __volatile__ ("movl 12(%ebp), %eax\n\t"

"addl 8(%ebp), %eax"

);

}

Here’s its generated code by gcc:

# gcc -S testasm_linux.c -o testasm_linux.s

1	# gcc -S testasm_linux.c -o testasm_linux.s

Output:

add2:
    pushl   %ebp
    movl    %esp, %ebp
#APP
# 21 "testasm_linux.c" 1
    movl 12(%ebp), %eax
    movl 8(%ebp), %edx
    addl %edx, %eax
# 0 "" 2
#NO_APP
    popl    %ebp
    ret
add3:
    pushl   %ebp
    movl    %esp, %ebp
#APP
# 31 "testasm_linux.c" 1
    movl 12(%ebp), %eax
    addl 8(%ebp), %eax
# 0 "" 2
#NO_APP
    popl    %ebp
    ret

add2:

pushl %ebp

movl %esp, %ebp

#APP

# 21 "testasm_linux.c" 1

movl 12(%ebp), %eax

movl 8(%ebp), %edx

addl %edx, %eax

# 0 "" 2

#NO_APP

popl %ebp

ret

add3:

pushl %ebp

movl %esp, %ebp

#APP

# 31 "testasm_linux.c" 1

movl 12(%ebp), %eax

addl 8(%ebp), %eax

# 0 "" 2

#NO_APP

popl %ebp

ret

Our inline assembly is surrounded by #APP and #NO_APP comments. Redundant gcc directives have already been removed; what remains is just the function prolog/epilog code. add2() and add3() work fine using default gcc flags. But this is not the case when the -O2 optimization flag is passed. From the output of gcc -S -O2 (try it yourself), I found that these 2 functions are inlined into their caller, with no function call at all. These 2 issues prevent the inline assembly from working:
– Depending on %eax to be the return value. But it is silently ignored in -O2.
– Depending on 12(%ebp) and 8(%ebp) as parameters of the function. But it is not guaranteed that the parameters are there in -O2.
To solve issue 1, an explicit return should be used:

/*
 * Inline-assembly add with an explicit output operand.
 * "=a" (res) tells gcc that the result is produced in %eax and must be
 * stored into res, which is then returned normally.
 * NOTE: the arguments are still read from 8(%ebp)/12(%ebp), which is not
 * guaranteed to hold them under -O2.
 */
int add4(int a, int b)
{
    int res;
    /* note the double % */
    __asm__ __volatile__ ("movl 12(%%ebp), %%eax\n\t"
                          "addl 8(%%ebp), %%eax"
                          : "=a" (res)
    );
    return res;
}

int add4(int a, int b)

{

int res;

/* note the double % */

__asm__ __volatile__ ("movl 12(%%ebp), %%eax\n\t"

"addl 8(%%ebp), %%eax"

: "=a" (res)

);

return res;

}

To solve issue 2, parameters are required to be loaded in registers first:

/*
 * Inline-assembly add with explicit input and output operands.
 * Inputs: a is loaded into %ecx ("c") and b into %edx ("d") by gcc before
 * the asm runs. Output: the sum is taken from %eax ("=a") into res.
 * Nothing depends on the stack frame layout any more.
 */
int add5(int a, int b)
{
    int res;
    __asm__ __volatile__ ("movl %%ecx, %%eax\n\t"
                          "addl %%edx, %%eax"
                          : "=a" (res)
                          : "c" (a), "d" (b)
    );
    return res;
}

int add5(int a, int b)

{

int res;

__asm__ __volatile__ ("movl %%ecx, %%eax\n\t"

"addl %%edx, %%eax"

: "=a" (res)

: "c" (a), "d" (b)

);

return res;

}

add5() now works in -O2. The default calling convention is cdecl for gcc. %eax, %ecx and %edx can be used freely in a function; it’s the caller’s duty to preserve them. These are the so-called scratch registers. So what if we specify registers other than these scratch registers, like %esi and %edi?

/*
 * Inline-assembly add whose inputs are placed in non-scratch registers:
 * a in %esi ("S") and b in %edi ("D"); the sum is taken from %eax ("=a").
 * In the generated code shown for this function, gcc pushes %esi and %edi
 * in the prolog and pops them in the epilog.
 */
int add6(int a, int b)
{
    int res;
    __asm__ __volatile__ ("movl %%esi, %%eax\n\t"
                          "addl %%edi, %%eax"
                          : "=a" (res)
                          : "S" (a), "D" (b)
    );
    return res;
}

int add6(int a, int b)

{

int res;

__asm__ __volatile__ ("movl %%esi, %%eax\n\t"

"addl %%edi, %%eax"

: "=a" (res)

: "S" (a), "D" (b)

);

return res;

}

Again with gcc -S:

add6:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %edi
    pushl   %esi
    pushl   %ebx
    subl    $20, %esp
    movl    8(%ebp), %esi
    movl    %esi, -32(%ebp)
    movl    12(%ebp), %edx
    movl    -32(%ebp), %esi
    movl    %edx, %edi
#APP
# 65 "testasm_linux.c" 1
    movl %esi, %eax
    addl %edi, %eax
# 0 "" 2
#NO_APP
    movl    %eax, %ebx
    movl    %ebx, -16(%ebp)
    movl    -16(%ebp), %eax
    addl    $20, %esp
    popl    %ebx
    popl    %esi
    popl    %edi
    popl    %ebp
    ret

add6:

pushl %ebp

movl %esp, %ebp

pushl %edi

pushl %esi

pushl %ebx

subl $20, %esp

movl 8(%ebp), %esi

movl %esi, -32(%ebp)

movl 12(%ebp), %edx

movl -32(%ebp), %esi

movl %edx, %edi

#APP

# 65 "testasm_linux.c" 1

movl %esi, %eax

addl %edi, %eax

# 0 "" 2

#NO_APP

movl %eax, %ebx

movl %ebx, -16(%ebp)

movl -16(%ebp), %eax

addl $20, %esp

popl %ebx

popl %esi

popl %edi

popl %ebp

ret

It seems that gcc’s code generation at the default optimization level is not so efficient:) But you should have noticed that %esi and %edi are pushed onto the stack before their usage, and popped when finished. This code is generated automatically by gcc, since you have specified %esi(“S”) and %edi(“D”) in the input list of the inline assembly. Actually, the code can be simpler by specifying %eax as both input and output:

/*
 * Single-instruction inline-assembly add.
 * %eax is used as both an input ("a" (a)) and the output ("=a" (res)), and
 * b is supplied in %edx ("d"), so the asm only has to add %edx to %eax.
 */
int add7(int a, int b)
{
    int res;
    __asm__ __volatile__ ("addl %%edx, %%eax"
                          : "=a" (res)
                          : "a" (a), "d" (b)
    );
    return res;
}

int add7(int a, int b)

{

int res;

__asm__ __volatile__ ("addl %%edx, %%eax"

: "=a" (res)

: "a" (a), "d" (b)

);

return res;

}

We can tell gcc to use a general register(“r”) available in current context in inline assembly:

/*
 * Inline-assembly add that lets gcc choose the input registers.
 * "r" asks for any general register; %1 and %2 refer to a and b, and the
 * result is taken from %eax ("=a").
 * NOTE(review): the template overwrites %eax in its first instruction,
 * before %2 is read. In the generated code shown for this function gcc
 * picked %eax for %1 and %edx for %2, so the result happens to be correct,
 * but the article calls this out as a potential pitfall.
 */
int add8(int a, int b)
{
    int res;
    __asm__ __volatile__ ("movl %1, %%eax\n\t"
                          "addl %2, %%eax"
                          : "=a" (res)
                          : "r" (a), "r" (b)
    );
    return res;
}

int add8(int a, int b)

{

int res;

__asm__ __volatile__ ("movl %1, %%eax\n\t"

"addl %2, %%eax"

: "=a" (res)

: "r" (a), "r" (b)

);

return res;

}

And wrong code generation again…:

add8:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %ebx
    subl    $20, %esp
    movl    8(%ebp), %eax
    movl    %eax, -24(%ebp)
    movl    12(%ebp), %edx
    movl    -24(%ebp), %eax
#APP
# 88 "testasm_linux.c" 1
    movl %eax, %eax
    addl %edx, %eax
# 0 "" 2
#NO_APP
    movl    %eax, %ebx
    movl    %ebx, -8(%ebp)
    movl    -8(%ebp), %eax
    addl    $20, %esp
    popl    %ebx
    popl    %ebp
    ret

add8:

pushl %ebp

movl %esp, %ebp

pushl %ebx

subl $20, %esp

movl 8(%ebp), %eax

movl %eax, -24(%ebp)

movl 12(%ebp), %edx

movl -24(%ebp), %eax

#APP

# 88 "testasm_linux.c" 1

movl %eax, %eax

addl %edx, %eax

# 0 "" 2

#NO_APP

movl %eax, %ebx

movl %ebx, -8(%ebp)

movl -8(%ebp), %eax

addl $20, %esp

popl %ebx

popl %ebp

ret

%eax is moved to %eax? gcc selected %eax and %edx as general registers to use. The code accidentally does the right job, but it is still a potential pitfall. Clobber list can be used to avoid this:

/*
 * Inline-assembly add using gcc-chosen registers for every operand and a
 * clobber list.
 * %0 is the output register bound to res; %1 and %2 are a and b. The sum is
 * built in %0 and additionally copied into %eax, which is declared as
 * clobbered so that gcc does not pick it for any of the "r" operands.
 */
int add9(int a, int b)
{
    int res;
    /*
     * The clobber list tells gcc which registers(or memory) are changed by the asm,
     * but not listed as an output.
     */
    __asm__ __volatile__ ("movl %1, %0\n\t"
                          "addl %2, %0\n\t"
                          "movl %0, %%eax"
                          : "=r" (res)
                          : "r" (a), "r" (b)
                          : "%eax"
    );
    return res;
}

int add9(int a, int b)

{

int res;

* The clobber list tells gcc which registers(or memory) are changed by the asm,

* but not listed as an output.

__asm__ __volatile__ ("movl %1, %0\n\t"

"addl %2, %0\n\t"

"movl %0, %%eax"

: "=r" (res)

: "r" (a), "r" (b)

: "%eax"

);

return res;

}

As commented inline: The clobber list tells gcc which registers(or memory) are changed by the asm, but not listed as an output. Now gcc does not use %eax as a candidate of general registers any more. gcc can also generate code to preserve(push onto stack) registers in clobber list if necessary.

Writing UTF-8 String Using ofstream in C++

Posted on August 21, 2013 by gonwan — No Comments ↓

I’ve googled a lot to find the answer. But none of the results really solves the problem simply and gracefully, even on stackoverflow. So we’ll do it ourselves here 🙂

Actually, std::string can hold multibyte characters. This is the basis of our solution:

static const char g_cs[] = "\xE4\xBD\xA0\xE5\xA5\xBD";

bool test_std_string()
{
    ofstream ofs("a.txt");
    ofs << string(g_cs) << endl;
    ofs.close();
    string s;
    ifstream ifs("a.txt");
    ifs >> s;
#if _WIN32
    wstring ws = utf8_to_ucs2(s);
    MessageBoxW(NULL, ws.c_str(), L"test_std_string", MB_OK);
#else
    cout << s << endl;
#endif
    return true;
}

static const char g_cs[] = "\xE4\xBD\xA0\xE5\xA5\xBD";

bool test_std_string()

{

ofstream ofs("a.txt");

ofs << string(g_cs) << endl;

ofs.close();

string s;

ifstream ifs("a.txt");

ifs >> s;

#if _WIN32

wstring ws = utf8_to_ucs2(s);

MessageBoxW(NULL, ws.c_str(), L"test_std_string", MB_OK);

#else

cout << s << endl;

#endif

return true;

}

g_cs is a Chinese word(“你好” which means hello) encoded in UTF-8. The code works under both Windows(WinXP+VS2005) and Linux(Ubuntu12.04+gcc4.6). You may wanna open a.txt to check whether the string is correctly written.

NOTE: Under Linux, we print the string directly since the default console encoding is UTF-8, and we can view the string. While under Windows, the console DOES NOT support UTF-8(codepage 65001) encoding. Printing to it simply produces garbled text. We just convert it to a std::wstring and use the MessageBox() API to check the result. I will cover the encoding issue in the Windows console in my next post, maybe.

I began to investigate the problem because I could not find a way to read/write a UTF-8 string to an XML file using boost::property_tree. Actually, it’s a bug and is already fixed in boost 1.47 and later versions. Unfortunately, Ubuntu 12.04 came with boost 1.46.1. When reading non-ASCII characters, some bytes are incorrectly skipped. The failing function is boost::property_tree::detail::rapidxml::internal::get_index(). My test code looks like:

/*
 * Sample XML document: an <aaa> root with two <bbb> children whose text is
 * given as raw non-ASCII bytes (the declaration states utf-8 encoding).
 */
static const char g_xml[] = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
    "<aaa>\n"
        "<bbb>\xE4\xBD\xA0\xE5\xA5\xBD</bbb>\n"
        "<bbb>\xE7\xA5\x9E\xE9\xA9\xAC</bbb>\n"
    "</aaa>\n";

/*
 * Round-trips UTF-8 text through boost::property_tree's XML reader/writer.
 *
 * Writes g_xml to a.xml, parses it, appends one more <bbb> element with
 * UTF-8 text, writes the tree to b.xml, parses b.xml again and displays the
 * first "aaa.bbb" value.
 *
 * Returns true on success; false if a.xml cannot be created or written, or
 * if boost reports an XML parse error or a missing path.
 */
bool test_boost_ptree()
{
    /* write to file */
    FILE *f = fopen("a.xml", "w");
    if (f == NULL)
        return false;   /* fwrite/fclose on a NULL stream would crash */
    bool wrote = fwrite(g_xml, sizeof(g_xml)-1, 1, f) == 1;
    bool closed = fclose(f) == 0;   /* always close, even after a short write */
    if (!wrote || !closed)
        return false;
    /* read and modify */
    const char cstr[] = "\xE4\xB8\xB8\xE5\xAD\x90\xE9\x85\xB1";
    try {
        boost::property_tree::ptree pt;
        int flags = boost::property_tree::xml_parser::trim_whitespace;
        boost::property_tree::read_xml("a.xml", pt, flags, std::locale());
        /* add() modifies pt in place; its return value is not needed */
        pt.get_child("aaa").add("bbb", string(cstr));
        boost::property_tree::xml_writer_settings<char> settings(' ', 2);
        boost::property_tree::write_xml("b.xml", pt, std::locale(), settings);
    } catch (boost::property_tree::xml_parser_error &) {
        return false;
    } catch (boost::property_tree::ptree_bad_path &) {
        return false;
    }
    /* read again */
    try {
        boost::property_tree::ptree pt;
        int flags = boost::property_tree::xml_parser::trim_whitespace;
        boost::property_tree::read_xml("b.xml", pt, flags, std::locale());
        string s = pt.get<string>("aaa.bbb");
#if _WIN32
        wstring ws = utf8_to_ucs2(s);
        MessageBoxW(NULL, ws.c_str(), L"test_boost_ptree", MB_OK);
#else
        cout << s << endl;
#endif
    } catch (boost::property_tree::xml_parser_error &) {
        return false;
    } catch (boost::property_tree::ptree_bad_path &) {
        return false;
    }
    return true;
}

static const char g_xml[] = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"

"<aaa>\n"

"<bbb>\xE4\xBD\xA0\xE5\xA5\xBD</bbb>\n"

"<bbb>\xE7\xA5\x9E\xE9\xA9\xAC</bbb>\n"

"</aaa>\n";

/*
 * Round-trips UTF-8 text through boost::property_tree's XML reader/writer.
 *
 * Writes g_xml to a.xml, parses it, appends one more <bbb> element with
 * UTF-8 text, writes the tree to b.xml, parses b.xml again and displays the
 * first "aaa.bbb" value.
 *
 * Returns true on success; false if a.xml cannot be created or written, or
 * if boost reports an XML parse error or a missing path.
 */
bool test_boost_ptree()
{
    /* write to file */
    FILE *f = fopen("a.xml", "w");
    if (f == NULL)
        return false;   /* fwrite/fclose on a NULL stream would crash */
    bool wrote = fwrite(g_xml, sizeof(g_xml)-1, 1, f) == 1;
    bool closed = fclose(f) == 0;   /* always close, even after a short write */
    if (!wrote || !closed)
        return false;
    /* read and modify */
    const char cstr[] = "\xE4\xB8\xB8\xE5\xAD\x90\xE9\x85\xB1";
    try {
        boost::property_tree::ptree pt;
        int flags = boost::property_tree::xml_parser::trim_whitespace;
        boost::property_tree::read_xml("a.xml", pt, flags, std::locale());
        /* add() modifies pt in place; its return value is not needed */
        pt.get_child("aaa").add("bbb", string(cstr));
        boost::property_tree::xml_writer_settings<char> settings(' ', 2);
        boost::property_tree::write_xml("b.xml", pt, std::locale(), settings);
    } catch (boost::property_tree::xml_parser_error &) {
        return false;
    } catch (boost::property_tree::ptree_bad_path &) {
        return false;
    }
    /* read again */
    try {
        boost::property_tree::ptree pt;
        int flags = boost::property_tree::xml_parser::trim_whitespace;
        boost::property_tree::read_xml("b.xml", pt, flags, std::locale());
        string s = pt.get<string>("aaa.bbb");
#if _WIN32
        wstring ws = utf8_to_ucs2(s);
        MessageBoxW(NULL, ws.c_str(), L"test_boost_ptree", MB_OK);
#else
        cout << s << endl;
#endif
    } catch (boost::property_tree::xml_parser_error &) {
        return false;
    } catch (boost::property_tree::ptree_bad_path &) {
        return false;
    }
    return true;
}

Almost the same structure as the previous function. And finally, the utf8_to_ucs2() function:

#ifdef _WIN32
/*
 * Converts a UTF-8 encoded std::string to a wide string using the Win32
 * MultiByteToWideChar() API.
 *
 * input: UTF-8 bytes; only input.length() bytes are converted.
 * Returns the converted text, or an empty wstring when the input is empty
 * or the conversion fails.
 */
wstring utf8_to_ucs2(const string &input)
{
    /* first call: ask how many wide characters are needed */
    int len = MultiByteToWideChar(CP_UTF8, 0, input.c_str(), (int)input.length(), NULL, 0);
    if (len <= 0)
        return wstring();   /* empty input or conversion error */
    wchar_t *pwc = new wchar_t[len+1];
    /* zero the buffer so the result is terminated after len characters */
    ZeroMemory(pwc, sizeof(wchar_t)*(len+1));
    MultiByteToWideChar(CP_UTF8, 0, input.c_str(), (int)input.length(), pwc, len+1);
    wstring output = pwc;
    /* memory from new[] must be released with delete[], not delete */
    delete[] pwc;
    return output;
}
#endif

#ifdef _WIN32

/*
 * Converts a UTF-8 encoded std::string to a wide string using the Win32
 * MultiByteToWideChar() API.
 *
 * input: UTF-8 bytes; only input.length() bytes are converted.
 * Returns the converted text, or an empty wstring when the input is empty
 * or the conversion fails.
 */
wstring utf8_to_ucs2(const string &input)
{
    /* first call: ask how many wide characters are needed */
    int len = MultiByteToWideChar(CP_UTF8, 0, input.c_str(), (int)input.length(), NULL, 0);
    if (len <= 0)
        return wstring();   /* empty input or conversion error */
    wchar_t *pwc = new wchar_t[len+1];
    /* zero the buffer so the result is terminated after len characters */
    ZeroMemory(pwc, sizeof(wchar_t)*(len+1));
    MultiByteToWideChar(CP_UTF8, 0, input.c_str(), (int)input.length(), pwc, len+1);
    wstring output = pwc;
    /* memory from new[] must be released with delete[], not delete */
    delete[] pwc;
    return output;
}

#endif

Please add header files yourselves to make it compile 🙂

Pointer-to-function Vs Pointer-to-member-function

Posted on June 25, 2013 by gonwan — No Comments ↓

There’s a series of C++ FAQs: http://www.parashift.com/c++-faq/pointers-to-members.html. And one of them addresses some technical details:

Pointers to member functions and pointers to data are not necessarily represented in the same way. A pointer to a member function might be a data structure rather than a single pointer. Think about it: if it’s pointing at a virtual function, it might not actually be pointing at a statically resolvable pile of code, so it might not even be a normal address – it might be a different data structure of some sort.

Let’s write some demo code:

#include <iostream>
using namespace std;

// Class with a single non-virtual member function (declared only);
// used to form a pointer-to-member-function type.
class A
{
public:
    void fun1();
};

// Class with a single virtual member function (declared only);
// used to form a pointer to a virtual member function.
class B
{
public:
    virtual void fun2();
};

// Derives publicly from both A and B (multiple inheritance) and adds no
// members of its own.
class C : public A, public B
{
};

/*
 * Prints sizeof for pointer-to-member-function types of A, B and C,
 * one line per typedef.
 */
int main()
{
    typedef void (A::*pfnAFun1)();
    typedef void (B::*pfnBFun2)();
    typedef void (C::*pfnCFun1)();
    typedef void (C::*pfnCFun2)();

    /* measure first, report afterwards */
    const size_t sizeAFun1 = sizeof(pfnAFun1);
    const size_t sizeBFun2 = sizeof(pfnBFun2);
    const size_t sizeCFun1 = sizeof(pfnCFun1);
    const size_t sizeCFun2 = sizeof(pfnCFun2);

    cout << "sizeof(pfnAFun1) is " << sizeAFun1 << endl;
    cout << "sizeof(pfnBFun2) is " << sizeBFun2 << endl;
    cout << "sizeof(pfnCFun1) is " << sizeCFun1 << endl;
    cout << "sizeof(pfnCFun2) is " << sizeCFun2 << endl;

    // "delete cout;" is intentionally left disabled here.
    return 0;
}

#include <iostream>

using namespace std;

class A

{

public:

void fun1();

};

class B

{

public:

virtual void fun2();

};

class C : public A, public B

{

};

int main()

{

typedef void (A::*pfnAFun1)();

typedef void (B::*pfnBFun2)();

typedef void (C::*pfnCFun1)();

typedef void (C::*pfnCFun2)();

cout << "sizeof(pfnAFun1) is " << sizeof(pfnAFun1) << endl;

cout << "sizeof(pfnBFun2) is " << sizeof(pfnBFun2) << endl;

cout << "sizeof(pfnCFun1) is " << sizeof(pfnCFun1) << endl;

cout << "sizeof(pfnCFun2) is " << sizeof(pfnCFun2) << endl;

//delete cout;

return 0;

}

Output on WinXP/VS2005:

sizeof(pfnAFun1) is 4
sizeof(pfnBFun2) is 4
sizeof(pfnCFun1) is 8
sizeof(pfnCFun2) is 8

sizeof(pfnAFun1) is 4

sizeof(pfnBFun2) is 4

sizeof(pfnCFun1) is 8

sizeof(pfnCFun2) is 8

Output on Ubuntu12.04/gcc4.6:

sizeof(pfnAFun1) is 8
sizeof(pfnBFun2) is 8
sizeof(pfnCFun1) is 8
sizeof(pfnCFun2) is 8

sizeof(pfnAFun1) is 8

sizeof(pfnBFun2) is 8

sizeof(pfnCFun1) is 8

sizeof(pfnCFun2) is 8

Both are run on 32bit systems. Sizes of pointer-to-member-function are not guaranteed to be equal to sizeof(void *). And you are not allowed to convert a pointer-to-member-function to void * or a plain pointer-to-function type. Only equality comparisons(==, !=) are supported. Thus, it can be used to implement the “Safe Bool Idiom” here: http://www.artima.com/cppsource/safebool.html.

Perhaps the best-known use of this technique comes from the C++ Standard: the conversion that allows the state of iostreams to be queried uses it.

if (std::cin) {  // Is the stream ok?
}

1 2	if (std::cin) { // Is the stream ok? }

But at least in gcc/libstdc++, its implementation uses bool and void * conversion operations. In basic_ios class:

public:
    //@{
    /**
     *  @brief  The quick-and-easy status check.
     *
     *  This allows you to write constructs such as
     *  "if (!a_stream) ..." and "while (a_stream) ..."
     */
    // Yields a null pointer when fail() is true, otherwise a non-null
    // pointer (this object), so the stream can be tested in a condition.
    operator void*() const
    { return this->fail() ? 0 : const_cast<basic_ios*>(this); }

    // Returns true exactly when fail() is true, enabling "if (!a_stream)".
    bool
    operator!() const
    { return this->fail(); }
    //@}

public:

//@{

/**

* @brief The quick-and-easy status check.

* This allows you to write constructs such as

* "if (!a_stream) ..." and "while (a_stream) ..."

operator void*() const

{ return this->fail() ? 0 : const_cast<basic_ios*>(this); }

bool

operator!() const

{ return this->fail(); }

//@}

This allows std::cout to even be deleted. Hmmm…Just write down some details here.