view f-demime/ptext_in.c @ 8:a92d0d59b669 default tip

f-demime: indicate X-backslash-escapes encoding in output
author Mychaela Falconia <falcon@freecalypso.org>
date Sat, 06 May 2023 17:00:23 +0000
parents 882d97266174
children
line wrap: on
line source

/*
 * This module implements transformations that are specific to text/plain.
 */

#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "defs.h"

/*
 * Output hook defined elsewhere; ptext_conv_init() points it at
 * first_pass() in this module.
 */
extern void (*dec_outf)();
/* Stream that receives everything this module writes. */
extern FILE *tempfile;
/* When zero, first_pass() never attempts to decode UTF-8 sequences. */
extern int text_is_utf8;

/*
 * Result flags, cleared by ptext_conv_init():
 * ptext_has_backslash is set when an input backslash had to be escaped,
 * ptext_has_linesplit is set when unit_output() had to break a long line.
 */
int ptext_has_backslash, ptext_has_linesplit;

/*
 * Input-side state of first_pass():
 * FPS_GROUND - nothing is being held back;
 * FPS_CR     - a CR has been seen and is held until the next byte arrives;
 * FPS_UTF8   - a multi-byte UTF-8 sequence is being collected in utf8_buf.
 */
static enum {
	FPS_GROUND,
	FPS_CR,
	FPS_UTF8
} first_pass_state;
/* Bytes of the UTF-8 sequence currently being collected. */
static u_char utf8_buf[4];
/*
 * utf8_nbytes: total length expected from the lead byte (2, 3 or 4);
 * utf8_wptr: number of bytes stored in utf8_buf so far;
 * unicode: code point value computed by utf8_collect().
 */
static unsigned utf8_nbytes, utf8_wptr, unicode;
/*
 * out_line_len: characters written on the current output line;
 * trailing_newline: count of newlines most recently written with nothing
 * after them, capped at 2 by newline_out().
 */
static unsigned out_line_len, trailing_newline;

/*
 * Write one indivisible output unit (a single character or one complete
 * escape sequence) to tempfile.
 *
 * If adding the unit would bring the current line to OUTPUT_LINE_MAX or
 * beyond, the line is first broken with a backslash followed by a newline
 * and ptext_has_linesplit is set.  Any output through this function
 * resets the trailing-newline count.
 */
static void
unit_output(str)
	char *str;
{
	unsigned unit_len = strlen(str);

	if (out_line_len + unit_len >= OUTPUT_LINE_MAX) {
		/* break the line: backslash, then newline */
		fputs("\\\n", tempfile);
		ptext_has_linesplit = 1;
		out_line_len = 0;
	}
	fputs(str, tempfile);
	trailing_newline = 0;
	out_line_len += unit_len;
}

/*
 * Terminate the current output line: write a newline to tempfile, reset
 * the line length counter and bump the trailing-newline count, which
 * never exceeds 2.
 */
static void
newline_out()
{
	putc('\n', tempfile);
	out_line_len = 0;
	trailing_newline = (trailing_newline >= 2) ? 2 : trailing_newline + 1;
}

/*
 * Output the character ch unchanged, as a one-character unit passed to
 * unit_output().
 */
static void
direct_output(ch)
	int ch;
{
	char unit[2] = { '\0', '\0' };

	unit[0] = ch;
	unit_output(unit);
}

/*
 * Output a two-character escape: a backslash followed by ch, passed to
 * unit_output() as a single unit so the pair is never split across lines.
 */
static void
simple_escape(ch)
	int ch;
{
	char seq[3] = { '\\', '\0', '\0' };

	seq[1] = ch;
	unit_output(seq);
}

/*
 * Output the byte ch as a \xHH escape (two uppercase hex digits), passed
 * to unit_output() as a single unit.
 *
 * Only the low 8 bits of ch are used: the buffer has room for exactly
 * two hex digits, so a negative or oversized value must not be allowed
 * to expand into more digits.
 */
static void
hex_escape(ch)
	int ch;
{
	char buf[5];	/* backslash, 'x', two hex digits, NUL */

	snprintf(buf, sizeof buf, "\\x%02X", (unsigned) ch & 0xFF);
	unit_output(buf);
}

/*
 * Output a single byte that is not being handled as a line ending or as
 * part of a decoded UTF-8 sequence.
 *
 *  - backslash: written as \\ and ptext_has_backslash is set;
 *  - printable ASCII (space through tilde) and TAB: written unchanged;
 *  - BEL, BS, VT, FF, CR, ESC: written as \a \b \v \f \r \e;
 *  - anything else: written as a \xHH hex escape.
 */
static void
regular_byte(ch)
	int ch;
{
	int esc_letter = 0;

	if (ch == '\\') {
		ptext_has_backslash = 1;
		simple_escape(ch);
		return;
	}
	switch (ch) {
	case 0x07:
		esc_letter = 'a';
		break;
	case 0x08:
		esc_letter = 'b';
		break;
	case 0x0B:
		esc_letter = 'v';
		break;
	case 0x0C:
		esc_letter = 'f';
		break;
	case 0x0D:
		esc_letter = 'r';
		break;
	case 0x1B:
		esc_letter = 'e';
		break;
	}
	if (esc_letter)
		simple_escape(esc_letter);
	else if (ch == 0x09 || (ch >= ' ' && ch <= '~'))
		direct_output(ch);
	else
		hex_escape(ch);
}

/*
 * Assemble the code point from the utf8_nbytes bytes held in utf8_buf
 * and store it in the file-level variable unicode.
 *
 * Returns 1 if the sequence is accepted, 0 otherwise.  A 3-byte sequence
 * is rejected when its value is below 0x800 and a 4-byte sequence when
 * its value is below 0x10000 (overlong forms).  No check is made for
 * 2-byte sequences; first_pass() only starts a sequence for lead bytes
 * 0xC2 and above.  Any other length returns 0 without touching unicode.
 *
 * Note: surrogate values and values above 0x10FFFF are not rejected here.
 */
static int
utf8_collect()
{
	unsigned cp, min_cp, i;

	/* payload bits of the lead byte and smallest acceptable result */
	switch (utf8_nbytes) {
	case 2:
		cp = utf8_buf[0] & 0x1F;
		min_cp = 0;
		break;
	case 3:
		cp = utf8_buf[0] & 0x0F;
		min_cp = 0x800;
		break;
	case 4:
		cp = utf8_buf[0] & 0x07;
		min_cp = 0x10000;
		break;
	default:
		return(0);
	}
	/* each following byte contributes its low six bits */
	for (i = 1; i < utf8_nbytes; i++)
		cp = (cp << 6) | (utf8_buf[i] & 0x3F);
	unicode = cp;
	return(cp >= min_cp);
}

/*
 * Output the code point held in unicode as an escape: \uXXXX (four hex
 * digits) for values below 0x10000, \UXXXXXX (six hex digits) otherwise.
 * The escape is passed to unit_output() as a single unit.
 */
static void
unicode_out()
{
	char esc[9];	/* backslash, 'U', up to six hex digits, NUL */

	if (unicode < 0x10000)
		sprintf(esc, "\\u%04X", unicode);
	else
		sprintf(esc, "\\U%06X", unicode);
	unit_output(esc);
}

/*
 * Emit whatever first_pass() is currently holding back and return to the
 * ground state: a pending CR is output through regular_byte(), and the
 * bytes of an unfinished UTF-8 sequence are output one by one through
 * regular_byte() as well.
 */
static void
flush_first_pass_state()
{
	unsigned idx;

	if (first_pass_state == FPS_CR) {
		regular_byte('\r');
	} else if (first_pass_state == FPS_UTF8) {
		idx = 0;
		while (idx < utf8_wptr) {
			regular_byte(utf8_buf[idx]);
			idx++;
		}
	}
	first_pass_state = FPS_GROUND;
}

/*
 * Per-byte input handler, installed as dec_outf by ptext_conv_init().
 *
 *  - CR immediately followed by LF produces a single newline; a CR
 *    followed by anything else is output via regular_byte().
 *  - A bare LF produces a newline.
 *  - When text_is_utf8 is nonzero, a lead byte in the range 0xC2-0xF7
 *    starts collection of a 2-, 3- or 4-byte sequence.  Once the expected
 *    number of continuation bytes (0x80-0xBF) has arrived, the sequence
 *    is output as a Unicode escape if utf8_collect() accepts it;
 *    otherwise its bytes are output individually via regular_byte().
 *    A sequence interrupted by a non-continuation byte is likewise
 *    flushed byte by byte before the new byte is processed.
 *  - All other bytes go through regular_byte().
 */
static void
first_pass(ch)
	int ch;
{
	if (first_pass_state == FPS_CR && ch == '\n') {
		first_pass_state = FPS_GROUND;
		newline_out();
		return;
	}
	if (first_pass_state == FPS_UTF8 && ch >= 0x80 && ch <= 0xBF) {
		utf8_buf[utf8_wptr++] = ch;
		if (utf8_wptr < utf8_nbytes)
			return;
		if (utf8_collect()) {
			first_pass_state = FPS_GROUND;
			unicode_out();
		} else {
			/*
			 * Rejected sequence: ch is already stored in
			 * utf8_buf, so the flush emits it together with
			 * the earlier bytes.  We must not fall through
			 * and emit ch a second time.
			 */
			flush_first_pass_state();
		}
		return;
	}
	flush_first_pass_state();
	switch (ch) {
	case '\n':
		newline_out();
		return;
	case '\r':
		/* hold the CR until we know whether LF follows */
		first_pass_state = FPS_CR;
		return;
	}
	if (!text_is_utf8 || ch < 0xC2 || ch > 0xF7) {
		regular_byte(ch);
		return;
	}
	/* UTF-8 lead byte: start collecting, length implied by its value */
	first_pass_state = FPS_UTF8;
	utf8_buf[0] = ch;
	utf8_wptr = 1;
	if (ch < 0xE0)
		utf8_nbytes = 2;
	else if (ch < 0xF0)
		utf8_nbytes = 3;
	else
		utf8_nbytes = 4;
}

/*
 * Prepare this module for converting a new text/plain body: install
 * first_pass() as the dec_outf handler, clear both result flags and
 * reset the input state machine and the output line tracking.
 * The trailing-newline count starts at 1.
 */
void
ptext_conv_init()
{
	/* result flags */
	ptext_has_backslash = 0;
	ptext_has_linesplit = 0;
	/* input and output state */
	first_pass_state = FPS_GROUND;
	trailing_newline = 1;
	out_line_len = 0;
	/* route incoming bytes to this module */
	dec_outf = first_pass;
}

/*
 * Finish the conversion: flush anything first_pass() is still holding
 * back, then write newlines to tempfile until the trailing-newline count
 * reaches 2.
 */
void
ptext_conv_finish()
{
	flush_first_pass_state();
	for (; trailing_newline < 2; trailing_newline++)
		putc('\n', tempfile);
}