Paste number 77871: first actual whirl at dfa based utf-8 decoder in c

Index of paste annotations: 1 | 2

Paste number 77871: first actual whirl at dfa based utf-8 decoder in c
Pasted by: [bjoern]
When:2 years, 10 months ago
Share:Tweet this! | http://paste.lisp.org/+1O33
Channel:#swhack
Paste contents:
Raw Source | XML | Display As
#include <stdio.h>

typedef unsigned __int8 uint8_t;
typedef unsigned __int32 uint32_t;

static const uint8_t d1[] = {
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38,
  0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38,
  0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
  0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x0c, 0x0c, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xae, 0x9e, 0x9e, 0x9e, 0x9e, 0x9e, 0x9e, 0x9e,
  0x9e, 0x9e, 0x9e, 0x9e, 0x9e, 0x8e, 0x7e, 0x7e,
  0x6f, 0x5f, 0x5f, 0x5f, 0x4f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
};

static const uint8_t d2[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 8, 7, 6, 4, 5, 4, 3, 2, 1, 0, 0, 0,
  0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 8, 7, 6, 4, 5, 4, 3, 2, 1, 0, 0, 0,
};

void decode(uint8_t* s) {

  uint8_t byte, stat, data, type;
  uint32_t unic;

  stat = 9;
  unic = 0;

  while (byte = *s++) {
    data = d1[ byte ];
    type = data >> 4;
    stat = d2[ stat << 4 | type ];
    byte = byte ^ (data << 4);
    unic = unic << 6 | byte;

    if (stat == 1)
      printf("%06x\n", unic);
  }

}

int main() {
  char* s = "\xe2\x82\xac";

  decode((uint8_t*)s);

  return 0;  
}

Annotations for this paste:

Annotation number 1: slightly twiddled version
Pasted by: [bjoern]
When:2 years, 10 months ago
Share:Tweet this! | http://paste.lisp.org/+1O33/1
Paste contents:
Raw Source | Display As
static const uint8_t d_combined[] = {
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  070,070,070,070,070,070,070,070,070,070,070,070,070,070,070,070,
  050,050,050,050,050,050,050,050,050,050,050,050,050,050,050,050,
  030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,
  030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,
  204,204,188,188,188,188,188,188,188,188,188,188,188,188,188,188,
  188,188,188,188,188,188,188,188,188,188,188,188,188,188,188,188,
  174,158,158,158,158,158,158,158,158,158,158,158,158,142,126,126,
  111, 95, 95, 95, 79,207,207,207,207,207,207,207,207,207,207,207,
  0,1,1,1,8,7,6,4,5,4,3,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
  1,4,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,4,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,4,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,8,7,6,4,5,4,3,2,1,1,1,1,
};

void decode(uint8_t* s, uint32_t* unip) {
  uint8_t byte, stat, data;
  uint32_t unic;

  stat = 9;
  unic = 0;

  while ((byte = *s++)) {
    data  = d_combined[ byte ];
    stat  = d_combined[ 256 + (stat << 4) + (data >> 4) ];
    byte ^= (data << 4);
    unic  = (unic << 6) | byte;

    if (!stat) {
      *unip = unic;
      unic = 0;
    }
  }
}

Annotation number 2: table documentation
Pasted by: [bjoern]
When:2 years, 10 months ago
Share:Tweet this! | http://paste.lisp.org/+1O33/2
Paste contents:
Raw Source | Display As
/*
  The first 256 entries are tuples of 4 bit values. The lower bits
  are a mask that when xor'd with a byte removes the leading utf-8
  bits. The upper bits are a character class number. The remaining
  160 entries are a minimal deterministic finite automaton. It has
  10 states and each state has 13 character class transitions, and 
  3 unused transitions for padding reasons. When the automaton en-
  ters state zero, it has found a complete valid utf-8 code point;
  if it enters state one then the input sequence is not utf-8. The
  start state is state nine. Note the mixture of octal and decimal
  for stylistic reasons. The first 128 entries are obviously some-
  what unnecessary, but removing them would require another branch.
*/

Colorize as:
Show Line Numbers
Index of paste annotations: 1 | 2

Lisppaste pastes can be made by anyone at any time. Imagine a fearsomely comprehensive disclaimer of liability. Now fear, comprehensively.