<?xml version="1.0"?>
<paste-with-annotations>
  <paste>
    <number>
      <integer>77871</integer>
    </number>
    <user>
      <string>[bjoern]</string>
    </user>
    <title>
      <string>first actual whirl at dfa based utf-8 decoder in c</string>
    </title>
    <contents>
      <string>#include &lt;stdio.h&gt;

typedef unsigned __int8 uint8_t;
typedef unsigned __int32 uint32_t;

static const uint8_t d1[] = {
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
  0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38,
  0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38,
  0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
  0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
  0x0c, 0x0c, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc, 0xbc,
  0xae, 0x9e, 0x9e, 0x9e, 0x9e, 0x9e, 0x9e, 0x9e,
  0x9e, 0x9e, 0x9e, 0x9e, 0x9e, 0x8e, 0x7e, 0x7e,
  0x6f, 0x5f, 0x5f, 0x5f, 0x4f, 0x0f, 0x0f, 0x0f,
  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
};

static const uint8_t d2[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 8, 7, 6, 4, 5, 4, 3, 2, 1, 0, 0, 0,
  0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 8, 7, 6, 4, 5, 4, 3, 2, 1, 0, 0, 0,
};

void decode(uint8_t* s) {

  uint8_t byte, stat, data, type;
  uint32_t unic;

  stat = 9;
  unic = 0;

  while (byte = *s++) {
    data = d1[ byte ];
    type = data &gt;&gt; 4;
    stat = d2[ stat &lt;&lt; 4 | type ];
    byte = byte ^ (data &lt;&lt; 4);
    unic = unic &lt;&lt; 6 | byte;

    if (stat == 1)
      printf(&quot;%06x\n&quot;, unic);
  }

}

int main() {
  char* s = &quot;\xe2\x82\xac&quot;;

  decode((uint8_t*)s);

  return 0;  
}
</string>
    </contents>
    <universal-time>
      <integer>3447578974</integer>
    </universal-time>
    <channel>
      <string>#swhack</string>
    </channel>
    <colorization-mode>
      <string>None</string>
    </colorization-mode>
    <maybe-spam>
      <null/>
    </maybe-spam>
    <is-unicode>
      <keyword>TRUE</keyword>
    </is-unicode>
    <deletion-requested>
      <null/>
    </deletion-requested>
    <deletion-requested-email>
      <null/>
    </deletion-requested-email>
    <expiration-time>
      <null/>
    </expiration-time>
  </paste>
  <annotation>
    <number>
      <integer>2</integer>
    </number>
    <user>
      <string>[bjoern]</string>
    </user>
    <title>
      <string>table documentation</string>
    </title>
    <contents>
      <string>/*
  The first 256 entries are tuples of 4 bit values. The lower bits
  are a mask that when xor'd with a byte removes the leading utf-8
  bits. The upper bits are a character class number. The remaining
  160 entries are a minimal deterministic finite automaton. It has
  10 states and each state has 13 character class transitions, and 
  3 unused transitions for padding reasons. When the automaton en-
  ters state zero, it has found a complete valid utf-8 code point;
  if it enters state one then the input sequence is not utf-8. The
  start state is state nine. Note the mixture of octal and decimal
  for stylistic reasons. The first 128 entries are obviously some-
  what unnecessary, but removing them would require another branch.
*/
</string>
    </contents>
    <universal-time>
      <integer>3447607196</integer>
    </universal-time>
    <channel>
      <string>#swhack</string>
    </channel>
    <colorization-mode>
      <string></string>
    </colorization-mode>
    <maybe-spam>
      <null/>
    </maybe-spam>
    <is-unicode>
      <keyword>TRUE</keyword>
    </is-unicode>
    <deletion-requested>
      <null/>
    </deletion-requested>
    <deletion-requested-email>
      <null/>
    </deletion-requested-email>
    <expiration-time>
      <null/>
    </expiration-time>
  </annotation>
  <annotation>
    <number>
      <integer>1</integer>
    </number>
    <user>
      <string>[bjoern]</string>
    </user>
    <title>
      <string>slightly twiddled version</string>
    </title>
    <contents>
      <string>static const uint8_t d_combined[] = {
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,
  070,070,070,070,070,070,070,070,070,070,070,070,070,070,070,070,
  050,050,050,050,050,050,050,050,050,050,050,050,050,050,050,050,
  030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,
  030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,030,
  204,204,188,188,188,188,188,188,188,188,188,188,188,188,188,188,
  188,188,188,188,188,188,188,188,188,188,188,188,188,188,188,188,
  174,158,158,158,158,158,158,158,158,158,158,158,158,142,126,126,
  111, 95, 95, 95, 79,207,207,207,207,207,207,207,207,207,207,207,
  0,1,1,1,8,7,6,4,5,4,3,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
  1,4,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,4,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,4,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,8,7,6,4,5,4,3,2,1,1,1,1,
};

void decode(uint8_t* s, uint32_t* unip) {
  uint8_t byte, stat, data;
  uint32_t unic;

  stat = 9;
  unic = 0;

  while ((byte = *s++)) {
    data  = d_combined[ byte ];
    stat  = d_combined[ 256 + (stat &lt;&lt; 4) + (data &gt;&gt; 4) ];
    byte ^= (data &lt;&lt; 4);
    unic  = (unic &lt;&lt; 6) | byte;

    if (!stat) {
      *unip = unic;
      unic = 0;
    }
  }
}
</string>
    </contents>
    <universal-time>
      <integer>3447604687</integer>
    </universal-time>
    <channel>
      <string>#swhack</string>
    </channel>
    <colorization-mode>
      <string></string>
    </colorization-mode>
    <maybe-spam>
      <null/>
    </maybe-spam>
    <is-unicode>
      <keyword>TRUE</keyword>
    </is-unicode>
    <deletion-requested>
      <null/>
    </deletion-requested>
    <deletion-requested-email>
      <null/>
    </deletion-requested-email>
    <expiration-time>
      <null/>
    </expiration-time>
  </annotation>
</paste-with-annotations>
