No OneTemporary
Actions

Authored By

Unknown

Size

34 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/src/Hyphenator.cpp b/src/Hyphenator.cpp
	index 4fe5e2c..b5e5da4 100644
	--- a/src/Hyphenator.cpp
	+++ b/src/Hyphenator.cpp
	@@ -1,262 +1,262 @@
	/* libhyphenate: A TeX-like hyphenation algorithm.
	* Copyright (C) 2007 Steve Wolter
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*
	* If you have any questions, feel free to contact me:
	* http://swolter.sdf1.org
	**/
	#include "Hyphenator.h"
	#include <iostream>
	#include <fstream>
	#include <vector>
	#include <map>
	#include <memory>
	#include <ctype.h>
	#include <stdlib.h>

	#include "HyphenationRule.h"
	#include "HyphenationTree.h"

	#include "UTF8Functions.h"

	#define UTF8_MAX 6

	using namespace std;
	using namespace Hyphenate;

	/** Load the hyphenation patterns stored in the file `filename` into a newly
	 * allocated HyphenationTree; ownership of the tree passes to the caller. */
	static auto_ptr<HyphenationTree> read_hyphenation_table(const char *filename) {
		auto_ptr<HyphenationTree> tree(new HyphenationTree());

		/* The stream is handed to loadPatterns() as-is; this function does not
		 * check whether the file could be opened. */
		ifstream patterns(filename, fstream::in);
		tree->loadPatterns(patterns);

		return tree;
	}

	/** Construct a hyphenator whose dictionary is read from the pattern file
	 * named by `filename`. */
	Hyphenate::Hyphenator::Hyphenator(const char *filename)
	{
		this->dictionary = read_hyphenation_table(filename);
	}

	/* The destructor body is intentionally empty. */
	Hyphenator::~Hyphenator()
	{
	}

	/** Hyphenate every word contained in `word`.
	 *
	 * The input is scanned byte by byte. Runs of alphabetic characters (as
	 * judged by utf32IsAlpha on the decoded UTF-8 character) are passed to
	 * hyphenate_word(); every other byte is copied to the output unchanged.
	 *
	 * \param word   UTF-8 text, possibly containing several words.
	 * \param hyphen String handed to hyphenate_word() for each hyphenation point.
	 * \return The text with all words hyphenated.
	 */
	std::string Hyphenator::hyphenate
	(const std::string &word, const std::string &hyphen)
	{
		std::string result;

		/* Byte offset where the current word started, or npos while we are
		 * outside of a word. This must have the same type as npos: an
		 * `unsigned int` initialised with -1 does not compare equal to
		 * string::npos where size_t is wider than unsigned int, which made the
		 * "outside a word" state unreachable and sent a bogus offset to substr(). */
		std::string::size_type word_start = std::string::npos;

		/* Go through the input. All non-alpha characters are added to the
		 * output immediately, and words are hyphenated and then added. */
		for (std::string::size_type i = 0; i < word.size(); i++) {
			/* UTF-8 tail bytes never start a character, so they cannot
			 * change whether we are inside a word. */
			if ((word[i] & 0xC0) != 0x80) {
				const bool is_alpha = utf32IsAlpha(utf8GetCharacter(word.c_str() + i));

				if (word_start == std::string::npos && is_alpha) {
					word_start = i;
				} else if (word_start != std::string::npos && !is_alpha) {
					result +=
						hyphenate_word(word.substr(word_start, i - word_start), hyphen);
					word_start = std::string::npos;
				}
			}

			/* Outside a word, bytes are copied verbatim. */
			if (word_start == std::string::npos)
				result += word[i];
		}

		/* Flush a word that runs up to the end of the input. */
		if (word_start != std::string::npos)
			result += hyphenate_word(word.substr(word_start), hyphen);

		return result;
	}

	std::string Hyphenator::hyphenate_word
	(const std::string &word, const std::string &hyphen)
	{
	auto_ptr<vector<const HyphenationRule*> > rules =
	dictionary->applyPatterns(word);

	/* Build our result string. Of course, we _could_ insert characters in
	* w, but that would be highly inefficient. */
	string result;

	int acc_skip = 0;
	for (unsigned int i = 0; i < word.size(); i++) {
	if ((*rules)[i] != NULL)
	acc_skip += (*rules)[i]->apply(result, hyphen);

	if (acc_skip > 0)
	acc_skip--;
	else
	result += word[i];
	}

	return result;
	}

	/* Split `src` into two lines near the character index `len`.
	 *
	 * `cur` is advanced `len` UTF-8 characters into `src`. If a breakable space
	 * is found there (or one character later), the text is cut at that space and
	 * surrounding spaces are dropped. Otherwise the code walks backwards looking
	 * for a word that can be hyphenated so that the first part still ends at or
	 * before the border, or for an earlier breakable space.
	 *
	 * Returns a pair: first = the text for the current line, second = the rest.
	 *
	 * NOTE: this block is shown as a unified diff; lines starting with '-' / '+'
	 * are the old / new version of the same statement and are kept verbatim.
	 * The declaration of eol/bol in the last branch had lost its pointer
	 * asterisks and has been restored to `const char *eol = cur, *bol = cur;`. */
	pair<std::string, std::string> Hyphenator::hyphenate_at
	(const std::string &src, const std::string &hyphen, size_t len)
	{
	/* First of all, find the word which needs to be hyphenated. */
	const char *cur = src.c_str();
	for (unsigned int i = 0; i < len; i++)
	cur = utf8GoToNextCharacter(cur);

	const char *next = cur;
	- if (!utf32IsSpace(utf8GetCharacter(next)))
	+ if (!utf32IsBreakableSpace(utf8GetCharacter(next)))
	next = utf8GoToNextCharacter(next);
	pair<string, string> result;

	- if (utf32IsSpace(utf8GetCharacter(next))) {
	+ if (utf32IsBreakableSpace(utf8GetCharacter(next))) {
	/* We are lucky: There is a space we can hyphenate at. */

	/* We leave no spaces at the end of a line: */
	- while (utf32IsSpace(utf8GetCharacter(cur)))
	+ while (utf32IsBreakableSpace(utf8GetCharacter(cur)))
	cur = utf8GoToPrevCharacter(cur);
	/* Note: this local `len` shadows the parameter of the same name. */
	int len = cur - src.c_str() + 1;
	result.first = src.substr(0, len);

	/* Neither do we leave spaces at the beginning of the next. */
	- while (utf32IsSpace(utf8GetCharacter(next)))
	+ while (utf32IsBreakableSpace(utf8GetCharacter(next)))
	next = utf8GoToNextCharacter(next);
	result.second = src.substr(next - src.c_str());

	} else {
	/* We can hyphenate at hyphenation points in words or at spaces, whatever
	* comes earlier. We will check all words here in the loop. */
	const char *border = cur;
	while (true) {
	/* Find the start of a word first. */
	bool in_word = utf32IsAlpha(utf8GetCharacter(cur));
	const char *word_start = NULL;
	while (cur > src.c_str()) {
	cur = utf8GoToPrevCharacter(cur);
	int ch = utf8GetCharacter(cur);

	if (in_word && (!utf32IsAlpha(ch))) {
	/* If we have a word, try hyphenating it.*/
	word_start = utf8GoToNextCharacter(cur);
	break;
	- } else if (utf32IsSpace(ch)) {
	+ } else if (utf32IsBreakableSpace(ch)) {
	break;
	} else if (!in_word && utf32IsAlpha(ch))
	in_word = true;

	if (cur == src.c_str() && in_word)
	word_start = cur;
	}

	/* There are two reasons why we may have left the previous loop with-
	* out result:
	* Either because our word goes all the way to the first character,
	* or because we found whitespace. */
	/* In the first case, there is nothing really hyphenateable. */
	if (word_start != NULL) {
	/* We have the start of a word, now look for the character after
	* the end. */
	const char *word_end = word_start;
	while (utf32IsAlpha(utf8GetCharacter(word_end)))
	word_end = utf8GoToNextCharacter(word_end);

	/* Build the substring consisting of the word. */
	string word;
	for (const char *i = word_start; i < word_end; i++)
	word += *i;

	/* Hyphenate the word. */
	auto_ptr<vector<const HyphenationRule*> > rules =
	dictionary->applyPatterns(word);

	/* Determine the index of the latest hyphenation that will still
	* fit. */
	int latest_possible_hyphenation = -1;
	int earliest_hyphenation = -1;
	for (int i = 0; i < (int)rules->size(); i++)
	if ((*rules)[i] != NULL) {
	if (earliest_hyphenation == -1)
	earliest_hyphenation = i;
	if (word_start + i +
	(*rules)[i]->spaceNeededPreHyphen() + hyphen.length()
	<= border)
	{
	if (i > latest_possible_hyphenation) {
	latest_possible_hyphenation = i;
	}
	} else
	break;
	}

	/* If no breakable space precedes the word, fall back to the earliest
	* hyphenation point even though it does not fit before the border. */
	bool have_space = false;
	for (const char *i = src.c_str(); i <= word_start;
	i = utf8GoToNextCharacter(i))
	- if (utf32IsSpace(utf8GetCharacter(i))) {
	+ if (utf32IsBreakableSpace(utf8GetCharacter(i))) {
	have_space = true;
	break;
	}
	if (latest_possible_hyphenation == -1 && !have_space)
	latest_possible_hyphenation = earliest_hyphenation;

	/* Apply the best hyphenation, if any. */
	if (latest_possible_hyphenation >= 0) {
	int i = latest_possible_hyphenation;
	result.first = src.substr(0, word_start - src.c_str() + i);
	(*rules)[i]->apply_first(result.first, hyphen);
	int skip = (*rules)[i]->apply_second(result.second);
	const char *after_hyphen = word_start + i + skip;
	result.second += string(after_hyphen);
	break;
	}
	}

	if (cur == src.c_str()) {
	/* We cannot hyphenate at all, so leave the first block standing
	* and move to its end. */
	const char *eol = cur;
	- while (*eol != 0 && !utf32IsSpace(utf8GetCharacter(eol)))
	+ while (*eol != 0 && !utf32IsBreakableSpace(utf8GetCharacter(eol)))
	eol = utf8GoToNextCharacter(eol);

	result.first = src.substr(0, eol - src.c_str() + 1);
	- while (*eol != 0 && utf32IsSpace(utf8GetCharacter(eol)))
	+ while (*eol != 0 && utf32IsBreakableSpace(utf8GetCharacter(eol)))
	eol = utf8GoToNextCharacter(eol);
	result.second = string(eol);
	break;
	- } else if (utf32IsSpace(utf8GetCharacter(cur))) {
	+ } else if (utf32IsBreakableSpace(utf8GetCharacter(cur))) {
	/* eol is the end of the previous line, bol the start of the
	* next. */
	const char *eol = cur, *bol = cur;
	- while (utf32IsSpace(utf8GetCharacter(eol)))
	+ while (utf32IsBreakableSpace(utf8GetCharacter(eol)))
	eol = utf8GoToPrevCharacter(eol);
	- while (utf32IsSpace(utf8GetCharacter(bol)))
	+ while (utf32IsBreakableSpace(utf8GetCharacter(bol)))
	bol = utf8GoToNextCharacter(bol);

	result.first = src.substr(0, eol - src.c_str() + 1);
	result.second = string(bol);
	break;
	}
	}
	}

	return result;
	}

	/** Return the rule vector that the dictionary's applyPatterns() produces
	 * for `word`, without applying any of the rules. */
	std::auto_ptr<std::vector<const HyphenationRule*> >
	Hyphenate::Hyphenator::applyHyphenationRules(const std::string& word)
	{
		std::auto_ptr<std::vector<const HyphenationRule*> > rules =
			dictionary->applyPatterns(word);
		return rules;
	}
	diff --git a/src/UTF8Functions.cpp b/src/UTF8Functions.cpp
	index 2c34b61..0e9ae46 100644
	--- a/src/UTF8Functions.cpp
	+++ b/src/UTF8Functions.cpp
	@@ -1,512 +1,523 @@
	/*
	* Copyright (C) 2019 Me and My Shadow
	*
	* This file is part of Me and My Shadow.
	*
	* Me and My Shadow is free software: you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation, either version 3 of the License, or
	* (at your option) any later version.
	*
	* Me and My Shadow is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
	*/

	#include <stdio.h>
	#include <math.h>
	#include <string.h>
	#include <algorithm>
	#include <string>
	#include "UTF8Functions.h"

	// Decode one character from a UTF-8 string and advance the position.
	// s: the string (NUL-terminated)
	// p [in,out]: byte offset of the character to read; on return it is the
	//   offset just past the bytes consumed. It is left unchanged at the
	//   terminating NUL, and left on the offending byte when a multi-byte
	//   sequence is cut short by a non-continuation byte.
	// return value: the character read, in UTF-32 format; 0 means end of string,
	//   -1 means error (stray continuation bytes, truncated sequence, a lead
	//   byte >= 0xF8, or a value >= 0x110000).
	int utf8ReadForward(const char* s, int& p) {
		int ch = (unsigned char)s[p];

		// Single-byte (ASCII) character or the terminating NUL.
		if (ch < 0x80) {
			if (ch) p++;
			return ch;
		}

		// Continuation byte without a lead byte: skip the whole invalid run.
		if (ch < 0xC0) {
			while (((unsigned char)s[p] & 0xC0) == 0x80) p++;
			return -1;
		}

		// The lead byte tells how many continuation bytes follow and carries
		// the most significant payload bits.
		int tailBytes;
		if (ch < 0xE0) {
			tailBytes = 1;
			ch &= 0x1F;
		} else if (ch < 0xF0) {
			tailBytes = 2;
			ch &= 0xF;
		} else if (ch < 0xF8) {
			tailBytes = 3;
			ch &= 0x7;
		} else {
			p++;
			return -1;
		}

		for (int k = 0; k < tailBytes; k++) {
			const int c = (unsigned char)s[++p];
			// This also stops at the terminating NUL, so we never read past it.
			if ((c & 0xC0) != 0x80) return -1;
			ch = (ch << 6) | (c & 0x3F);
		}
		p++;

		// Only 4-byte sequences can exceed the Unicode range.
		return (ch >= 0x110000) ? -1 : ch;
	}

	// Step back one character in a UTF-8 string and decode it (experimental).
	// s: the string
	// p [in,out]: byte offset; on return it is the offset of the first byte of
	//   the character that was stepped over
	// return value: the character read, in UTF-32 format; 0 when p is already
	//   at (or before) the start of the string, -1 means error
	int utf8ReadBackward(const char* s, int& p) {
		if (p <= 0) return 0;

		// Move back at least one byte, then keep going while we are on a
		// continuation byte and have not reached the start of the string.
		--p;
		while (p > 0 && ((unsigned char)s[p] & 0xC0) == 0x80) {
			--p;
		}

		// Decode with a scratch offset so that p stays on the character start.
		int cursor = p;
		return utf8ReadForward(s, cursor);
	}

	// Returns a pointer to the start of the character following the one at s.
	// At the terminating NUL the pointer is returned unchanged.
	const char* utf8GoToNextCharacter(const char* s) {
		if (*s == 0) return s;

		// Step over the current byte, then over any continuation bytes.
		for (++s; ((unsigned char)(*s) & 0xC0) == 0x80; ++s) {
		}
		return s;
	}

	// Returns a pointer to the start of the character preceding s.
	// There is no check against the beginning of the buffer.
	const char* utf8GoToPrevCharacter(const char* s) {
		--s;
		// Keep moving back while we are on a continuation byte.
		while (((unsigned char)(*s) & 0xC0) == 0x80) {
			--s;
		}
		return s;
	}

	// Returns true if the UTF-32 code point ch is one of the space characters
	// listed below. The '-' / '+' lines are the old / new side of the diff: the
	// new version extends the contiguous range from 0x2000..0x200A to
	// 0x2000..0x200B.
	bool utf32IsSpace(int ch) {
	- //ripped from the output of glib-2.60.0
	switch (ch) {
	case 0x9: case 0xA: case 0xC: case 0xD: case 0x20: case 0xA0: case 0x1680:
	case 0x2028: case 0x2029: case 0x202F: case 0x205F: case 0x3000:
	return true;
	default:
	- return (ch >= 0x2000 && ch <= 0x200A);
	+ return (ch >= 0x2000 && ch <= 0x200B);
	+ }
	+}
	+
	// (added by the diff) Returns true if ch is a space at which a line may be
	// broken. It accepts the same code points as the new utf32IsSpace except the
	// three that are commented out in the case lists: 0xA0, 0x2007 and 0x202F
	// (the Unicode no-break space, figure space and narrow no-break space).
	+bool utf32IsBreakableSpace(int ch) {
	+ switch (ch) {
	+ case 0x9: case 0xA: case 0xC: case 0xD: case 0x20: /* case 0xA0: */ case 0x1680:
	+ case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: /* case 0x2007: */
	+ case 0x2008: case 0x2009: case 0x200A: case 0x200B:
	+ case 0x2028: case 0x2029: /* case 0x202F: */ case 0x205F: case 0x3000:
	+ return true;
	+ default:
	+ return false;
	}
	}

	// Returns true if the UTF-32 code point ch falls into one of the alphabetic
	// runs listed below. The data was taken from the output of glib-2.60.0 and
	// covers only a subset of Unicode.
	bool utf32IsAlpha(int ch) {
		// Each row is {first code point, run length}; rows are in ascending order.
		static const int alphaRuns[][2] = {
			// 65 - 246
			{65, 26}, {97, 26}, {170, 1}, {181, 1}, {186, 1}, {192, 23}, {216, 31},
			// 248 - 750
			{248, 458}, {710, 12}, {736, 5}, {748, 1}, {750, 1},
			// 880 - 1327
			{880, 5}, {886, 2}, {890, 4}, {895, 1}, {902, 1}, {904, 3}, {908, 1},
			{910, 20}, {931, 83}, {1015, 139}, {1162, 166},
			// 0x0530 - 0x1CFF is not covered
			// 7424 - 8188
			{7424, 192}, {7680, 278}, {7960, 6}, {7968, 38}, {8008, 6}, {8016, 8},
			{8025, 1}, {8027, 1}, {8029, 1}, {8031, 31}, {8064, 53}, {8118, 7},
			{8126, 1}, {8130, 3}, {8134, 7}, {8144, 4}, {8150, 6}, {8160, 13},
			{8178, 3}, {8182, 7},
			// 0x2000 - 0x10FFFF is not covered
		};
		const int runCount = sizeof(alphaRuns) / sizeof(alphaRuns[0]);

		for (int k = 0; k < runCount; k++) {
			const int first = alphaRuns[k][0];
			// Rows are ascending, so once ch is below a run start it cannot
			// match any later run either.
			if (ch < first) return false;
			if (ch < first + alphaRuns[k][1]) return true;
		}

		return false;
	}

	// Returns true if the UTF-32 code point ch lies in one of the CJK ranges
	// listed below.
	bool utf32IsCJK(int ch) {
		return (ch >= 0x002E80 && ch <= 0x009FFF) /* CJK scripts and symbols */
			|| (ch >= 0x00F900 && ch <= 0x00FAFF) /* CJK Compatibility Ideographs */
			|| (ch >= 0x00FE30 && ch <= 0x00FE4F) /* CJK Compatibility Forms */
			|| (ch >= 0x020000 && ch <= 0x03FFFF) /* Supplementary Ideographic Plane & Tertiary Ideographic Plane */
			;
	}

	// Returns true if ch is one of the "ending" punctuation marks listed below
	// (the list is taken from M$ Word).
	bool utf32IsCJKEndingPunctuation(int ch) {
		static const int endingMarks[] = {
			0x21, 0x25, 0x29, 0x2C, 0x2E, 0x3A, 0x3B, 0x3E, 0x3F, 0x5D, 0x7D,
			0xA2, 0xA8, 0xB0, 0xB7,
			0x2C7, 0x2C9,
			0x2015, 0x2016, 0x2019, 0x201D, 0x2026, 0x2030, 0x2032, 0x2033, 0x203A, 0x2103, 0x2236,
			0x3001, 0x3002, 0x3003, 0x3009, 0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x301E,
			0x0FE36, 0x0FE3A, 0x0FE3E, 0x0FE40, 0x0FE44, 0x0FE5A, 0x0FE5C, 0x0FE5E,
			0x0FF01, 0x0FF02, 0x0FF05, 0x0FF07, 0x0FF09, 0x0FF0C, 0x0FF0E, 0x0FF1A, 0x0FF1B, 0x0FF1F,
			0x0FF3D, 0x0FF40, 0x0FF5C, 0x0FF5D, 0x0FF5E, 0x0FFE0,
		};
		const int markCount = sizeof(endingMarks) / sizeof(endingMarks[0]);

		for (int k = 0; k < markCount; k++) {
			if (endingMarks[k] == ch) return true;
		}
		return false;
	}

	// Returns true if ch is one of the "starting" punctuation marks listed
	// below (the list is taken from M$ Word).
	bool utf32IsCJKStartingPunctuation(int ch) {
		static const int startingMarks[] = {
			0x24, 0x28, 0x5B, 0x7B,
			0xA3, 0xA5, 0xB7,
			0x2018, 0x201C,
			0x3008, 0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x301D,
			0x0FE59, 0x0FE5B, 0x0FE5D,
			0x0FF04, 0x0FF08, 0x0FF0E,
			0x0FF3B, 0x0FF5B, 0x0FFE1, 0x0FFE5,
		};
		const int markCount = sizeof(startingMarks) / sizeof(startingMarks[0]);

		for (int k = 0; k < markCount; k++) {
			if (startingMarks[k] == ch) return true;
		}
		return false;
	}

	// Maps the UTF-32 code point ch to its lower-case counterpart using the
	// tables below; code points not covered by any table are returned unchanged.
	//
	// Two kinds of table are used:
	//  * ranges_A_B:  triples {first, count, delta}; every code point in
	//    [first, first+count) maps to ch + delta.
	//  * ranges2_A_B: pairs {first, last} (inclusive); within such a run the
	//    code points at an even offset from `first` map to ch + 1, the others
	//    are left unchanged.
	// The A_B suffix gives the span of code points the table covers and is used
	// to pick at most one table of each kind before scanning it.
	int utf32ToLower(int ch) {
		//ripped from the output of glib-2.60.0

		static const int ranges_65_223[] = {
			65, 26, 32,
			192, 23, 32,
			216, 7, 32,
		};
		static const int ranges_304_504[] = {
			304, 1, -199,
			376, 1, -121,
			385, 1, 210,
			390, 1, 206,
			393, 2, 205,
			398, 1, 79,
			399, 1, 202,
			400, 1, 203,
			403, 1, 205,
			404, 1, 207,
			406, 1, 211,
			407, 1, 209,
			412, 1, 211,
			413, 1, 213,
			415, 1, 214,
			422, 1, 218,
			425, 1, 218,
			430, 1, 218,
			433, 2, 217,
			439, 1, 219,
			452, 1, 2,
			455, 1, 2,
			458, 1, 2,
			497, 1, 2,
			502, 1, -97,
			503, 1, -56,
		};
		static const int ranges_544_582[] = {
			544, 1, -130,
			570, 1, 10795,
			573, 1, -163,
			574, 1, 10792,
			579, 1, -195,
			580, 1, 69,
			581, 1, 71,
		};
		static const int ranges_895_1018[] = {
			895, 1, 116,
			902, 1, 38,
			904, 3, 37,
			908, 1, 64,
			910, 2, 63,
			913, 17, 32,
			931, 9, 32,
			975, 1, 8,
			1012, 1, -60,
			1017, 1, -7,
		};
		static const int ranges_1021_1367[] = {
			1021, 3, -130,
			1024, 16, 80,
			1040, 32, 32,
			1216, 1, 15,
			1329, 38, 48,
		};
		static const int ranges_4256_5110[] = {
			4256, 38, 7264,
			4295, 1, 7264,
			4301, 1, 7264,
			5024, 80, 38864,
			5104, 6, 8,
		};
		static const int ranges_7312_8499[] = {
			7312, 43, -3008,
			7357, 3, -3008,
			7838, 1, -7615,
			7944, 8, -8,
			7960, 6, -8,
			7976, 8, -8,
			7992, 8, -8,
			8008, 6, -8,
			8025, 1, -8,
			8027, 1, -8,
			8029, 1, -8,
			8031, 1, -8,
			8040, 8, -8,
			8072, 8, -8,
			8088, 8, -8,
			8104, 8, -8,
			8120, 2, -8,
			8122, 2, -74,
			8124, 1, -9,
			8136, 4, -86,
			8140, 1, -9,
			8152, 2, -8,
			8154, 2, -100,
			8168, 2, -8,
			8170, 2, -112,
			8172, 1, -7,
			8184, 2, -128,
			8186, 2, -126,
			8188, 1, -9,
			8486, 1, -7517,
			8490, 1, -8383,
			8491, 1, -8262,
			8498, 1, 28,
		};
		static const int ranges_11264_11392[] = {
			11264, 47, 48,
			11362, 1, -10743,
			11363, 1, -3814,
			11364, 1, -10727,
			11373, 1, -10780,
			11374, 1, -10749,
			11375, 1, -10783,
			11376, 1, -10782,
			11390, 2, -10815,
		};
		static const int ranges_42877_42932[] = {
			42877, 1, -35332,
			42893, 1, -42280,
			42922, 1, -42308,
			42923, 1, -42319,
			42924, 1, -42315,
			42925, 1, -42305,
			42926, 1, -42308,
			42928, 1, -42258,
			42929, 1, -42282,
			42930, 1, -42261,
			42931, 1, 928,
		};
		static const int ranges_65313_125218[] = {
			65313, 26, 32,
			66560, 40, 40,
			66736, 36, 40,
			68736, 51, 64,
			71840, 32, 32,
			93760, 32, 32,
			125184, 34, 34,
		};

		static const int ranges2_256_440[] = {
			256, 302,
			306, 310,
			313, 327,
			330, 374,
			377, 381,
			386, 388,
			391, 391,
			395, 395,
			401, 401,
			408, 408,
			416, 420,
			423, 423,
			428, 428,
			431, 431,
			435, 437,
			440, 440,
		};
		static const int ranges2_444_590[] = {
			444, 444,
			453, 453,
			456, 456,
			459, 475,
			478, 494,
			498, 500,
			504, 542,
			546, 562,
			571, 571,
			577, 577,
			582, 590,
		};
		static const int ranges2_880_1326[] = {
			880, 882,
			886, 886,
			984, 1006,
			1015, 1015,
			1018, 1018,
			1120, 1152,
			1162, 1214,
			1217, 1229,
			1232, 1326,
		};
		static const int ranges2_7680_11506[] = {
			7680, 7828,
			7840, 7934,
			8579, 8579,
			11360, 11360,
			11367, 11371,
			11378, 11378,
			11381, 11381,
			11392, 11490,
			11499, 11501,
			11506, 11506,
		};
		static const int ranges2_42560_42936[] = {
			42560, 42604,
			42624, 42650,
			42786, 42798,
			42802, 42862,
			42873, 42875,
			42878, 42886,
			42891, 42891,
			42896, 42898,
			42902, 42920,
			42932, 42936,
		};

		// Both selectors are pointers into the tables above (the pointer
		// declarators had been lost from this declaration).
		const int *ranges = NULL, *ranges2 = NULL;
		int rangeSize = 0, range2Size = 0;

	#define RANGE(LPS,LPE) ranges_##LPS##_##LPE
	#define CHECK_RANGE(LPS,LPE) \
		else if (ch < LPS) { \
		} else if (ch < LPE) { \
			ranges = RANGE(LPS,LPE); rangeSize = sizeof(RANGE(LPS,LPE)) / sizeof(RANGE(LPS,LPE)[0]); \
		}

	#define RANGE2(LPS,LPE) ranges2_##LPS##_##LPE
	#define CHECK_RANGE2(LPS,LPE) \
		else if (ch < LPS) { \
		} else if (ch <= LPE) { \
			ranges2 = RANGE2(LPS,LPE); range2Size = sizeof(RANGE2(LPS,LPE)) / sizeof(RANGE2(LPS,LPE)[0]); \
		}

		// Pick the {first, count, delta} table whose span contains ch, if any.
		// The leading `if (false) {}` lets every CHECK_RANGE start with `else`.
		if (false) {}
		CHECK_RANGE(65, 223)
		CHECK_RANGE(304, 504)
		CHECK_RANGE(544, 582)
		CHECK_RANGE(895, 1018)
		CHECK_RANGE(1021, 1367)
		CHECK_RANGE(4256, 5110)
		CHECK_RANGE(7312, 8499)
		CHECK_RANGE(11264, 11392)
		CHECK_RANGE(42877, 42932)
		CHECK_RANGE(65313, 125218)

		for (int i = 0; i < rangeSize; i += 3) {
			const int lps = ranges[i];
			const int lpe = lps + ranges[i + 1];
			if (ch < lps) break; // tables are ascending: no later row can match
			else if (ch < lpe) {
				return ch + ranges[i + 2];
			}
		}

		// Not found above: try the alternating upper/lower runs.
		if (false) {}
		CHECK_RANGE2(256, 440)
		CHECK_RANGE2(444, 590)
		CHECK_RANGE2(880, 1326)
		CHECK_RANGE2(7680, 11506)
		CHECK_RANGE2(42560, 42936)

		for (int i = 0; i < range2Size; i += 2) {
			const int lps = ranges2[i];
			const int lpe = ranges2[i + 1];
			if (ch < lps) break;
			else if (ch <= lpe) {
				// Even offsets map to the next code point; odd offsets are
				// returned unchanged.
				if (((ch - lps) & 0x1) == 0) return ch + 1;
				else break;
			}
		}

	#undef RANGE
	#undef RANGE2
	#undef CHECK_RANGE
	#undef CHECK_RANGE2

		return ch;
	}
	diff --git a/src/UTF8Functions.h b/src/UTF8Functions.h
	index c34345e..4ab2e68 100644
	--- a/src/UTF8Functions.h
	+++ b/src/UTF8Functions.h
	@@ -1,162 +1,163 @@
	/*
	* Copyright (C) 2019 Me and My Shadow
	*
	* This file is part of Me and My Shadow.
	*
	* Me and My Shadow is free software: you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation, either version 3 of the License, or
	* (at your option) any later version.
	*
	* Me and My Shadow is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
	*/

	#ifndef UTF8FUNCTIONS_H
	#define UTF8FUNCTIONS_H

	// A helper function to read a character from utf8 string and advance the pointer
	// s: the string
	// p [in,out]: the position
	// return value: the character read, in utf32 format, 0 means end of string, -1 means error
	int utf8ReadForward(const char* s, int& p);

	// A helper function to read a character backward from utf8 string and advance the pointer (experimental)
	// s: the string
	// p [in,out]: the position
	// return value: the character read, in utf32 format, 0 means end of string, -1 means error
	int utf8ReadBackward(const char* s, int& p);

	// Decode the first character of a UTF-8 string without moving any pointer.
	// s: the string
	// return value: the character read, in utf32 format, 0 means end of string, -1 means error
	// NOTE: Prefer utf8ReadForward() when reading several characters in a row,
	// since it also reports how far to advance.
	inline int utf8GetCharacter(const char* s) {
		int offset = 0; // scratch position; the advanced value is discarded
		const int ch = utf8ReadForward(s, offset);
		return ch;
	}

	// A helper function to advance the pointer in a utf8 string to next character
	// s: the pointer
	// return value: the new pointer
	// WARNING: there is no sanity check!
	const char* utf8GoToNextCharacter(const char* s);

	// A helper function to advance the pointer in a utf8 string to previous character
	// s: the pointer
	// return value: the new pointer
	// WARNING: there is no sanity check!
	const char* utf8GoToPrevCharacter(const char* s);

	// Classification and case-mapping helpers; each takes a UTF-32 code point.
	// (The '+' line is the declaration added by this diff.)
	bool utf32IsSpace(int ch);
	+bool utf32IsBreakableSpace(int ch);
	bool utf32IsAlpha(int ch);
	bool utf32IsCJK(int ch);
	bool utf32IsCJKEndingPunctuation(int ch); // check if the character shouldn't be at start of line in CJK mode
	bool utf32IsCJKStartingPunctuation(int ch); // check if the character shouldn't be at end of line in CJK mode
	int utf32ToLower(int ch);

	// Opens a loop over the characters of a UTF-8 byte sequence.
	// STR: anything indexable with [] that yields bytes
	// I:   name of the loop index variable (a size_t declared by the macro)
	// M:   number of bytes in STR; no byte at index >= M is read
	// CH:  name of the int variable that receives each decoded code point
	// INVALID_CH: value assigned to CH for a malformed or truncated sequence
	//   (in that case only the single offending byte is consumed)
	// The loop body follows the macro invocation and must be closed with
	// U8STRING_FOR_EACH_CHARACTER_DO_END().
	#define U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(STR,I,M,CH,INVALID_CH) \
		for(size_t I=0;I<M;I++){ \
			int CH=(unsigned char)STR[I]; \
			if(CH<0x80){ \
			}else if(CH<0xC0){ \
				CH=INVALID_CH; \
			}else if(CH<0xE0){ \
				if(I+1>=M) CH=INVALID_CH; \
				else{ \
					int c2=(unsigned char)STR[I+1]; \
					if((c2&0xC0)!=0x80) CH=INVALID_CH; \
					else{ \
						CH=((CH & 0x1F)<<6) | (c2 & 0x3F); \
						I++; \
					} \
				} \
			}else if(CH<0xF0){ \
				if(I+2>=M) CH=INVALID_CH; \
				else{ \
					int c2=(unsigned char)STR[I+1]; \
					int c3=(unsigned char)STR[I+2]; \
					if((c2&0xC0)!=0x80 || (c3&0xC0)!=0x80) CH=INVALID_CH; \
					else{ \
						CH=((CH & 0xF)<<12) | ((c2 & 0x3F)<<6) | (c3 & 0x3F); \
						I+=2; \
					} \
				} \
			}else if(CH<0xF8){ \
				if(I+3>=M) CH=INVALID_CH; \
				else{ \
					int c2=(unsigned char)STR[I+1]; \
					int c3=(unsigned char)STR[I+2]; \
					int c4=(unsigned char)STR[I+3]; \
					if((c2&0xC0)!=0x80 || (c3&0xC0)!=0x80 || (c4&0xC0)!=0x80) CH=INVALID_CH; \
					else{ \
						CH=((CH & 0x7)<<18) | ((c2 & 0x3F)<<12) | ((c3 & 0x3F)<<6) | (c4 & 0x3F); \
						if(CH>=0x110000) CH=INVALID_CH; \
						else I+=3; \
					} \
				} \
			}else{ \
				CH=INVALID_CH; \
			}

	// Closes the loop opened by U8STRING_FOR_EACH_CHARACTER_DO_BEGIN.
	#define U8STRING_FOR_EACH_CHARACTER_DO_END() }

	// Encodes the code point CH as UTF-8 and passes each resulting byte value,
	// in order, to OPERATION (e.g. `str.push_back`). CH is evaluated several
	// times, so it should be a plain variable. No range check is performed.
	#define U8_ENCODE(CH,OPERATION) \
		if(CH<0x80){ \
			OPERATION(CH); \
		}else if(CH<0x800){ \
			OPERATION(0xC0 | (CH>>6)); \
			OPERATION(0x80 | (CH & 0x3F)); \
		}else if(CH<0x10000){ \
			OPERATION(0xE0 | (CH>>12)); \
			OPERATION(0x80 | ((CH>>6) & 0x3F)); \
			OPERATION(0x80 | (CH & 0x3F)); \
		}else{ \
			OPERATION(0xF0 | (CH>>18)); \
			OPERATION(0x80 | ((CH>>12) & 0x3F)); \
			OPERATION(0x80 | ((CH>>6) & 0x3F)); \
			OPERATION(0x80 | (CH & 0x3F)); \
		}

	// Opens a loop over the characters of a UTF-16 sequence.
	// STR: anything indexable with [] that yields 16-bit code units
	// I:   name of the loop index variable (a size_t declared by the macro)
	// M:   number of code units in STR
	// CH:  name of the int variable that receives each decoded code point
	// INVALID_CH: value assigned to CH for an unpaired surrogate
	// A lead surrogate must be followed by a trail surrogate; the test is made
	// on the second unit (c2). Testing CH there could never succeed, because CH
	// is known to be a lead surrogate at that point, and so every surrogate
	// pair used to be reported as invalid.
	// The loop body follows the macro invocation and must be closed with
	// U16STRING_FOR_EACH_CHARACTER_DO_END().
	#define U16STRING_FOR_EACH_CHARACTER_DO_BEGIN(STR,I,M,CH,INVALID_CH) \
		for(size_t I=0;I<M;I++){ \
			int CH=(unsigned short)(STR[I]); \
			if(CH<0xD800){ \
			}else if(CH<0xDC00){ \
				/* lead surrogate */ \
				I++; \
				if(I>=M) CH=INVALID_CH; \
				else{ \
					int c2=(unsigned short)STR[I]; \
					if(c2>=0xDC00 && c2<0xE000){ \
						/* trail surrogate */ \
						CH=0x10000 + (((CH & 0x3FF)<<10) | (c2 & 0x3FF)); \
					}else{ \
						/* invalid: re-examine this unit in the next iteration */ \
						CH=INVALID_CH; \
						I--; \
					} \
				} \
			}else if(CH<0xE000){ \
				/* invalid trail surrogate */ \
				CH=INVALID_CH; \
			}

	// Closes the loop opened by U16STRING_FOR_EACH_CHARACTER_DO_BEGIN.
	#define U16STRING_FOR_EACH_CHARACTER_DO_END() }

	// Encodes the code point CH as UTF-16 and passes each resulting code unit,
	// in order, to OPERATION: one unit below 0x10000, otherwise a lead/trail
	// surrogate pair. CH is evaluated several times, so it should be a plain
	// variable. No range check is performed.
	#define U16_ENCODE(CH,OPERATION) \
		if(CH<0x10000){ \
			OPERATION(CH); \
		}else{ \
			OPERATION(0xD800 | ((CH-0x10000)>>10)); \
			OPERATION(0xDC00 | (CH & 0x3FF)); \
		}

	const int REPLACEMENT_CHARACTER = 0x00FFFD;

	#endif
	diff --git a/src/WordWrapper.cpp b/src/WordWrapper.cpp
	index ceae42b..3bc20cd 100644
	--- a/src/WordWrapper.cpp
	+++ b/src/WordWrapper.cpp
	@@ -1,329 +1,329 @@
	/*
	* Copyright (C) 2019 Me and My Shadow
	*
	* This file is part of Me and My Shadow.
	*
	* Me and My Shadow is free software: you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation, either version 3 of the License, or
	* (at your option) any later version.
	*
	* Me and My Shadow is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
	*/

	#include "WordWrapper.h"
	#include "HyphenationManager.h"
	#include "HyphenationRule.h"
	#include "UTF8Functions.h"
	#include <algorithm>

	#include <assert.h>

	#include <SDL_ttf_fontfallback.h>

	int WordWrapper::getTextWidth(const std::string& s) {
	if (s.empty()) return 0;

	int w = 0;

	if (font) {
	TTF_SizeUTF8(font, s.c_str(), &w, NULL);
	} else {
	const size_t m = s.size();

	U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(s, i, m, ch, REPLACEMENT_CHARACTER);
	w++;
	U8STRING_FOR_EACH_CHARACTER_DO_END();
	}

	return w;
	}

	// Returns the width of a single glyph: 1 when no font is set, otherwise the
	// value TTF_GlyphMetrics writes through its last argument (0 if it writes
	// nothing).
	int WordWrapper::getGlyphWidth(int ch) {
		if (!font) return 1;

		int glyphWidth = 0;
		TTF_GlyphMetrics(font, ch, NULL, NULL, NULL, NULL, &glyphWidth);
		return glyphWidth;
	}

	// Creates a wrapper with no font, a maximal width of 0, and both word
	// wrapping and hyperlink reservation switched off.
	WordWrapper::WordWrapper()
		: font(NULL), maxWidth(0), wordWrap(false), reserveHyperlinks(false)
	{}

	// The destructor body is intentionally empty.
	WordWrapper::~WordWrapper()
	{}

	// Returns true if `word` must not be hyphenated, i.e. if
	//  * reserveHyperlinks is set and the word contains "http://" or "https://"
	//    (scheme letters compared case-insensitively), or
	//  * the word equals one of reservedWords, or
	//  * the word contains one of reservedFragments.
	bool WordWrapper::isReserved(const std::string& word) {
		if (reserveHyperlinks) {
			const char *s = word.c_str();
			const size_t m = word.size();
			for (size_t i = 0; i < m; i++) {
				// we only support http or https
				// Every look-ahead below is only evaluated after the previous
				// character matched a non-NUL value, so the reads never go
				// past the terminating NUL of c_str().
				if ((s[i] == 'H' || s[i] == 'h')
					&& (s[i + 1] == 'T' || s[i + 1] == 't')
					&& (s[i + 2] == 'T' || s[i + 2] == 't')
					&& (s[i + 3] == 'P' || s[i + 3] == 'p'))
				{
					if (s[i + 4] == ':' && s[i + 5] == '/' && s[i + 6] == '/') {
						// http
						return true;
					} else if ((s[i + 4] == 'S' || s[i + 4] == 's') && s[i + 5] == ':' && s[i + 6] == '/' && s[i + 7] == '/') {
						// https
						return true;
					}
				}
			}
		}

		for (const std::string& s : reservedWords) {
			if (word == s) return true;
		}

		for (const std::string& s : reservedFragments) {
			if (word.find(s) != std::string::npos) return true;
		}

		return false;
	}

	// Splits `input` at '\n' (dropping every '\r') and feeds each resulting line
	// to addLine(), including the possibly empty text after the last '\n'.
	// Returns the largest value returned by any of those addLine() calls.
	int WordWrapper::addString(std::vector<std::string>& output, const std::string& input) {
		int widest = 0;
		std::string current;

		for (char c : input) {
			switch (c) {
			case '\r':
				// carriage returns are discarded
				break;
			case '\n':
				widest = std::max(addLine(output, current), widest);
				current.clear();
				break;
			default:
				current.push_back(c);
				break;
			}
		}

		return std::max(addLine(output, current), widest);
	}

	// Add a word to line, output the line only if the line+newWord doesn't fit the width and in this case put the newWord to the line.
	// output: receives every line that gets completed
	// line [in,out]: the line currently being built
	// lineWidth [in,out]: width of `line`
	// spaces: the white space preceding the word (dropped when the word starts a new line)
	// nonSpaces: the word itself
	// Returns the maximal width required for this string.
	// If the word does not fit, `hyphen` is non-empty and the word is not
	// reserved, the word is split at hyphenation points supplied by the
	// hyphenator so that as much as possible stays on the current line.
	int WordWrapper::addWord(std::vector<std::string>& output, std::string& line, int& lineWidth, const std::string& spaces, const std::string& nonSpaces) {
		int w1 = getTextWidth(spaces);

		{
			int w2 = getTextWidth(nonSpaces);

			//Check if it fits into current line.
			if (lineWidth + w1 + w2 <= maxWidth) {
				line += spaces + nonSpaces;
				lineWidth += w1 + w2;
				return lineWidth;
			}

			//Now it doesn't fit into current line.

			//Check if we should skip the hyphenation.
			if (hyphen.empty() || isReserved(nonSpaces)) {
				if (line.empty()) {
					//A line consists of at least one word, so we append it forcefully.
					line += spaces + nonSpaces;
					lineWidth += w1 + w2;
					return lineWidth;
				} else {
					//We output current line.
					output.push_back(line);

					//And add a new line consisting of new word (but we remove spaces in it).
					line = nonSpaces;
					int mw = std::max(lineWidth, w2);
					lineWidth = w2;
					return mw;
				}
			}
		}

		auto hm = getHyphenationManager();
		auto hyphenator = hyphenatorLanguage.empty() ? hm->getHyphenator() : hm->getHyphenator(hyphenatorLanguage);
		auto rules = hyphenator->applyHyphenationRules(nonSpaces);

		const size_t m = nonSpaces.size();

		//tmp: the part of the word collected since the last line break.
		//prev*: the widest hyphenation candidate seen so far that still fits.
		std::string tmp, prev;
		int skip = 0, prevSkip = 0, prevWidth = 0;
		size_t prevIndex = 0;
		int mw = lineWidth;

		for (size_t i = 0;; i++) {
			//The rule at byte i, or NULL at the end of the word. It is a pointer
			//into the rule vector and is re-assigned when rewinding below (the
			//pointer declarator and dereference had been lost from this line).
			const Hyphenate::HyphenationRule *rule = (i < m) ? (*rules)[i] : NULL;
			if (rule || i == m) {
				std::string tmp2 = tmp;
				if (rule) rule->apply_first(tmp2, hyphen);

				int newWidth = getTextWidth(tmp2);

				/*//debug
				printf("%-5d %s\n", newWidth, tmp2.c_str());*/

				//Check if we should output current line directly.
				if (lineWidth + w1 + newWidth > maxWidth && prev.empty() && !line.empty()) {
					//We output current line.
					output.push_back(line);
					mw = std::max(lineWidth, mw);

					line.clear();
					lineWidth = 0;
					w1 = 0;
				}

				//Check if the line is still too long.
				if (lineWidth + w1 + newWidth > maxWidth) {
					//Check if we have previous available hyphenation
					if (prev.empty()) {
						//Line is empty, we have to append it forcefully.
						assert(line.empty());

						if (w1 > 0) line += spaces;
						line += tmp2;
						if (i < m) {
							output.push_back(line);
							mw = std::max(lineWidth, mw);
							line.clear();
							lineWidth = 0;
							w1 = 0;
						} else {
							lineWidth += w1 + newWidth;
							mw = std::max(lineWidth, mw);
						}

						//Update buffer
						tmp.clear();
						if (rule) skip += rule->apply_second(tmp);
					} else {
						//We use previous available hyphenation
						if (w1 > 0) line += spaces;
						output.push_back(line + prev);
						mw = std::max(lineWidth + w1 + prevWidth, mw);
						line.clear();
						lineWidth = 0;
						w1 = 0;

						//Rewind
						prev.clear();
						prevWidth = 0;
						skip = prevSkip;
						i = prevIndex;

						//Update buffer
						tmp.clear();
						rule = (*rules)[i];
						assert(rule != NULL);
						skip += rule->apply_second(tmp);
					}
				} else if (i == m) {
					//Output last part
					if (w1 > 0) line += spaces;
					line += tmp2;
					lineWidth += w1 + newWidth;
					mw = std::max(lineWidth, mw);
				} else if (newWidth > prevWidth) {
					//Update prev hyphenation
					prev = tmp2;
					prevSkip = skip;
					prevWidth = newWidth;
					prevIndex = i;
				}
			}

			if (i >= m) break;

			//Bytes consumed by a rule's replacement text are not copied.
			if (skip > 0) skip--;
			else tmp.push_back(nonSpaces[i]);
		}

		return mw;
	}

	// Adds one line of text (`input`, expected to contain no '\n') to `output`.
	// With word wrap disabled the text is appended unchanged and its width is
	// returned. Otherwise the text is cut into "words" (a run of breakable
	// spaces followed by a run of other characters, or a single CJK character
	// together with adjacent punctuation) which are handed to addWord(); the
	// return value is the largest value returned by those addWord() calls.
	// The '-' / '+' lines below are the old / new side of the diff.
	int WordWrapper::addLine(std::vector<std::string>& output, const std::string& input) {
	if (!wordWrap) {
	//Word wrap is not enabled, simply add it to output
	output.push_back(input);
	return getTextWidth(input);
	}

	const size_t m = input.size();

	// spaces / nonSpaces: the pending word; line / lineWidth: the output line
	// currently being filled by addWord().
	std::string spaces, nonSpaces, line;
	int lineWidth = 0, mw = 0;

	bool prevIsCJK = false, prevIsCJKStarting = false;

	U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(input, i, m, ch, REPLACEMENT_CHARACTER);

	//A word consists of a sequence of white spaces and a sequence of non-white-spaces.

	//For CJK should only read one CJK character (possibly with a punctuation mark)

	if (ch == '\r') {
	- } else if (utf32IsSpace(ch)) {
	+ } else if (utf32IsBreakableSpace(ch)) {
	prevIsCJK = false;
	prevIsCJKStarting = false;
	// A space after non-space text completes the pending word.
	if (!nonSpaces.empty()) {
	mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
	spaces.clear();
	nonSpaces.clear();
	}
	U8_ENCODE(ch, spaces.push_back);
	} else {
	bool isCJK = utf32IsCJK(ch);
	bool isCJKStarting = utf32IsCJKStartingPunctuation(ch);
	if (prevIsCJK) {
	//Output the CJK character immediately unless current character can't be at start of line
	if (!utf32IsCJKEndingPunctuation(ch)) {
	mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
	spaces.clear();
	nonSpaces.clear();
	}
	} else if (isCJK && !nonSpaces.empty()) {
	//Output the existing non-CJK character immediately unless it can't be at end of line
	if (!prevIsCJKStarting) {
	mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
	spaces.clear();
	nonSpaces.clear();
	}
	}
	prevIsCJK = isCJK;
	prevIsCJKStarting = isCJKStarting;
	U8_ENCODE(ch, nonSpaces.push_back);
	}

	U8STRING_FOR_EACH_CHARACTER_DO_END();

	//FIXME: Here we temporarily ignore trailing spaces
	if (!nonSpaces.empty()) {
	mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
	}

	//Output the remaining text.
	output.push_back(line);

	return mw;
	}

	// Feeds every string of `input` to addLine(), in order, and returns the
	// largest value returned by any of those calls (0 for an empty input).
	int WordWrapper::addLines(std::vector<std::string>& output, const std::vector<std::string>& input) {
		int widest = 0;
		for (std::vector<std::string>::const_iterator it = input.begin(); it != input.end(); ++it) {
			const int lineResult = addLine(output, *it);
			widest = std::max(lineResult, widest);
		}
		return widest;
	}

File Metadata

Mime Type: text/x-diff
Expires: Sat, May 9, 8:04 PM (6 d, 23 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 62875
Default Alt Text: (34 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions