Page MenuHomePhabricator (Chris)

No OneTemporary

Authored By
Unknown
Size
34 KB
Referenced Files
None
Subscribers
None
diff --git a/src/Hyphenator.cpp b/src/Hyphenator.cpp
index 4fe5e2c..b5e5da4 100644
--- a/src/Hyphenator.cpp
+++ b/src/Hyphenator.cpp
@@ -1,262 +1,262 @@
/* libhyphenate: A TeX-like hyphenation algorithm.
* Copyright (C) 2007 Steve Wolter
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* If you have any questions, feel free to contact me:
* http://swolter.sdf1.org
**/
#include "Hyphenator.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <memory>
#include <ctype.h>
#include <stdlib.h>
#include "HyphenationRule.h"
#include "HyphenationTree.h"
#include "UTF8Functions.h"
#define UTF8_MAX 6
using namespace std;
using namespace Hyphenate;
/**
 * Parse a hyphenation pattern table.
 * Opens `filename` for reading and hands the stream to loadPatterns() on a
 * newly allocated HyphenationTree. The stream state is not inspected here.
 * Ownership of the tree is passed to the caller through the returned auto_ptr.
 */
static auto_ptr<HyphenationTree> read_hyphenation_table(const char *filename) {
    ifstream patternStream(filename, fstream::in);
    auto_ptr<HyphenationTree> tree(new HyphenationTree());
    tree->loadPatterns(patternStream);
    return tree;
}
/** Create a hyphenator whose pattern dictionary is read from `filename`. */
Hyphenate::Hyphenator::Hyphenator(const char *filename)
    : dictionary(read_hyphenation_table(filename))
{
}
/** The destructor body is intentionally empty. */
Hyphenator::~Hyphenator()
{
}
std::string Hyphenator::hyphenate
(const std::string &word, const std::string &hyphen)
{
string result;
unsigned int word_start = -1;
/* Go through the input. All non-alpha characters are added to the
* output immediately, and words are hyphenated and then added. */
for (unsigned int i = 0; i < word.size(); i++) {
/* Skip UTF-8 tail bytes. */
if ((word[i] & 0xC0) == 0x80)
;
else {
bool isalpha = utf32IsAlpha(utf8GetCharacter(word.c_str() + i));
if (word_start == string::npos && isalpha)
word_start = i;
else if (word_start != string::npos && !isalpha) {
result +=
hyphenate_word(word.substr(word_start, i - word_start), hyphen);
word_start = string::npos;
}
}
if (word_start == string::npos)
result += word[i];
}
if (word_start != string::npos)
result += hyphenate_word(word.substr(word_start), hyphen);
return result;
}
std::string Hyphenator::hyphenate_word
(const std::string &word, const std::string &hyphen)
{
auto_ptr<vector<const HyphenationRule*> > rules =
dictionary->applyPatterns(word);
/* Build our result string. Of course, we _could_ insert characters in
* w, but that would be highly inefficient. */
string result;
int acc_skip = 0;
for (unsigned int i = 0; i < word.size(); i++) {
if ((*rules)[i] != NULL)
acc_skip += (*rules)[i]->apply(result, hyphen);
if (acc_skip > 0)
acc_skip--;
else
result += word[i];
}
return result;
}
/* Split `src` into two lines at or before a requested character position.
 *
 * src:    UTF-8 text to split.
 * hyphen: string handed to the hyphenation rule when a word is broken.
 * len:    number of characters (not bytes) to advance from the start of src,
 *         using utf8GoToNextCharacter(), to reach the desired break position.
 * Returns a pair: first is the text kept on the current line, second is the
 * remaining text.
 *
 * In this diff every whitespace test switches from utf32IsSpace() to
 * utf32IsBreakableSpace().
 * NOTE(review): several loops below step backwards with
 * utf8GoToPrevCharacter() without comparing against src.c_str() -- confirm
 * the input can never consist only of breakable spaces up to the cursor. */
pair<std::string, std::string> Hyphenator::hyphenate_at
(const std::string &src, const std::string &hyphen, size_t len)
{
/* First of all, find the word which needs to be hyphenated. */
const char *cur = src.c_str();
for (unsigned int i = 0; i < len; i++)
cur = utf8GoToNextCharacter(cur);
/* `next` looks one character past `cur` unless `cur` itself is a breakable space. */
const char *next = cur;
- if (!utf32IsSpace(utf8GetCharacter(next)))
+ if (!utf32IsBreakableSpace(utf8GetCharacter(next)))
next = utf8GoToNextCharacter(next);
pair<string, string> result;
- if (utf32IsSpace(utf8GetCharacter(next))) {
+ if (utf32IsBreakableSpace(utf8GetCharacter(next))) {
/* We are lucky: There is a space we can hyphenate at. */
/* We leave no spaces at the end of a line: */
- while (utf32IsSpace(utf8GetCharacter(cur)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(cur)))
cur = utf8GoToPrevCharacter(cur);
/* NOTE(review): this local `len` shadows the parameter of the same name; here it is a byte count. */
int len = cur - src.c_str() + 1;
result.first = src.substr(0, len);
/* Neither do we leave spaces at the beginning of the next. */
- while (utf32IsSpace(utf8GetCharacter(next)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(next)))
next = utf8GoToNextCharacter(next);
result.second = src.substr(next - src.c_str());
} else {
/* We can hyphenate at hyphenation points in words or at spaces, whatever
* comes earlier. We will check all words here in the loop. */
/* `border` remembers the requested break position; `cur` moves backwards from it. */
const char *border = cur;
while (true) {
/* Find the start of a word first. */
bool in_word = utf32IsAlpha(utf8GetCharacter(cur));
const char *word_start = NULL;
while (cur > src.c_str()) {
cur = utf8GoToPrevCharacter(cur);
int ch = utf8GetCharacter(cur);
if (in_word && (!utf32IsAlpha(ch))) {
/* If we have a word, try hyphenating it.*/
word_start = utf8GoToNextCharacter(cur);
break;
- } else if (utf32IsSpace(ch)) {
+ } else if (utf32IsBreakableSpace(ch)) {
break;
} else if (!in_word && utf32IsAlpha(ch))
in_word = true;
if (cur == src.c_str() && in_word)
word_start = cur;
}
/* There are two reasons why we may have left the previous loop with-
* out result:
* Either because our word goes all the way to the first character,
* or because we found whitespace. */
/* In the first case, there is nothing really hyphenateable. */
if (word_start != NULL) {
/* We have the start of a word, now look for the character after
* the end. */
const char *word_end = word_start;
while (utf32IsAlpha(utf8GetCharacter(word_end)))
word_end = utf8GoToNextCharacter(word_end);
/* Build the substring consisting of the word. */
string word;
for (const char *i = word_start; i < word_end; i++)
word += *i;
/* Hyphenate the word. */
auto_ptr<vector<const HyphenationRule*> > rules =
dictionary->applyPatterns(word);
/* Determine the index of the latest hyphenation that will still
* fit. */
int latest_possible_hyphenation = -1;
int earliest_hyphenation = -1;
for (int i = 0; i < (int)rules->size(); i++)
if ((*rules)[i] != NULL) {
if (earliest_hyphenation == -1)
earliest_hyphenation = i;
/* The rule fits if the word part, the rule's pre-hyphen space and the hyphen end at or before `border`. */
if (word_start + i +
(*rules)[i]->spaceNeededPreHyphen() + hyphen.length()
<= border)
{
if (i > latest_possible_hyphenation) {
latest_possible_hyphenation = i;
}
} else
break;
}
/* Check whether any breakable space occurs from the start of src up to the word start. */
bool have_space = false;
for (const char *i = src.c_str(); i <= word_start;
i = utf8GoToNextCharacter(i))
- if (utf32IsSpace(utf8GetCharacter(i))) {
+ if (utf32IsBreakableSpace(utf8GetCharacter(i))) {
have_space = true;
break;
}
/* With no fitting hyphenation and no space to fall back on, use the earliest hyphenation (which may still be -1). */
if (latest_possible_hyphenation == -1 && !have_space)
latest_possible_hyphenation = earliest_hyphenation;
/* Apply the best hyphenation, if any. */
if (latest_possible_hyphenation >= 0) {
int i = latest_possible_hyphenation;
result.first = src.substr(0, word_start - src.c_str() + i);
(*rules)[i]->apply_first(result.first, hyphen);
int skip = (*rules)[i]->apply_second(result.second);
const char *after_hyphen = word_start + i + skip;
result.second += string(after_hyphen);
break;
}
}
if (cur == src.c_str()) {
/* We cannot hyphenate at all, so leave the first block standing
* and move to its end. */
const char *eol = cur;
- while (*eol != 0 && !utf32IsSpace(utf8GetCharacter(eol)))
+ while (*eol != 0 && !utf32IsBreakableSpace(utf8GetCharacter(eol)))
eol = utf8GoToNextCharacter(eol);
result.first = src.substr(0, eol - src.c_str() + 1);
- while (*eol != 0 && utf32IsSpace(utf8GetCharacter(eol)))
+ while (*eol != 0 && utf32IsBreakableSpace(utf8GetCharacter(eol)))
eol = utf8GoToNextCharacter(eol);
result.second = string(eol);
break;
- } else if (utf32IsSpace(utf8GetCharacter(cur))) {
+ } else if (utf32IsBreakableSpace(utf8GetCharacter(cur))) {
/* eol is the end of the previous line, bol the start of the
* next. */
const char *eol = cur, *bol = cur;
- while (utf32IsSpace(utf8GetCharacter(eol)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(eol)))
eol = utf8GoToPrevCharacter(eol);
- while (utf32IsSpace(utf8GetCharacter(bol)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(bol)))
bol = utf8GoToNextCharacter(bol);
result.first = src.substr(0, eol - src.c_str() + 1);
result.second = string(bol);
break;
}
}
}
return result;
}
/**
 * Run the dictionary's patterns over `word` and hand the resulting rule
 * vector to the caller without applying any of the rules.
 */
std::auto_ptr<std::vector<const HyphenationRule*> >
Hyphenate::Hyphenator::applyHyphenationRules(const std::string& word)
{
    std::auto_ptr<std::vector<const HyphenationRule*> > matched =
        dictionary->applyPatterns(word);
    return matched;
}
diff --git a/src/UTF8Functions.cpp b/src/UTF8Functions.cpp
index 2c34b61..0e9ae46 100644
--- a/src/UTF8Functions.cpp
+++ b/src/UTF8Functions.cpp
@@ -1,512 +1,523 @@
/*
* Copyright (C) 2019 Me and My Shadow
*
* This file is part of Me and My Shadow.
*
* Me and My Shadow is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Me and My Shadow is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <algorithm>
#include <string>
#include "UTF8Functions.h"
// Decode one character from a NUL-terminated UTF-8 string and advance the position.
// s: the string
// p [in,out]: byte position; on success it is moved past the decoded character.
//   At the terminating NUL it is left unchanged. On a bad continuation byte it is
//   left pointing at that byte. A run of stray continuation bytes is skipped
//   entirely, and a lead byte >= 0xF8 is skipped on its own.
// return value: the character read, in utf32 format, 0 means end of string, -1 means error
int utf8ReadForward(const char* s, int& p) {
    const int lead = (unsigned char)s[p];

    // Plain ASCII, including the terminator (which does not advance p).
    if (lead < 0x80) {
        if (lead != 0) p++;
        return lead;
    }

    // Continuation byte where a lead byte was expected: skip the whole run.
    if (lead < 0xC0) {
        while ((((unsigned char)s[p]) & 0xC0) == 0x80) p++;
        return -1;
    }

    // Lead bytes that would announce more than four bytes are not accepted.
    if (lead >= 0xF8) {
        p++;
        return -1;
    }

    // Work out how many continuation bytes follow and the lead's payload bits.
    int tailCount;
    int code;
    if (lead < 0xE0) {
        tailCount = 1;
        code = lead & 0x1F;
    } else if (lead < 0xF0) {
        tailCount = 2;
        code = lead & 0xF;
    } else {
        tailCount = 3;
        code = lead & 0x7;
    }

    for (int k = 0; k < tailCount; k++) {
        const int tail = (unsigned char)s[++p];
        if ((tail & 0xC0) != 0x80) return -1;
        code = (code << 6) | (tail & 0x3F);
    }
    p++;

    // Only a four-byte sequence can exceed the last valid code point.
    return (code >= 0x110000) ? -1 : code;
}
// Step back to the start of the previous character of a utf8 string and decode it (experimental)
// s: the string
// p [in,out]: byte position; moved back to the first byte that is not a
//   continuation byte (or to 0). It is left there, not advanced past the character.
// return value: the character read, in utf32 format, 0 means p was already at the start, -1 means error
int utf8ReadBackward(const char* s, int& p) {
    if (p <= 0) return 0;

    // Always move at least one byte, then keep going over continuation bytes.
    p--;
    while (p > 0 && (((unsigned char)s[p]) & 0xC0) == 0x80) {
        p--;
    }

    // Decode with a scratch position so that p stays on the character start.
    int scratch = p;
    return utf8ReadForward(s, scratch);
}
// Move to the first byte of the next character: step one byte, then skip any
// continuation bytes (10xxxxxx). The pointer is returned unchanged when it
// already points at the terminating NUL.
const char* utf8GoToNextCharacter(const char* s) {
    if (*s == 0) return s;
    for (++s; (((unsigned char)(*s)) & 0xC0) == 0x80; ++s) {
    }
    return s;
}
// Move to the first byte of the previous character: step back one byte, then
// keep stepping back over continuation bytes (10xxxxxx). There is no check
// against the start of the buffer, so the caller must guarantee that a
// non-continuation byte exists before s.
const char* utf8GoToPrevCharacter(const char* s) {
    --s;
    while ((((unsigned char)(*s)) & 0xC0) == 0x80) {
        --s;
    }
    return s;
}
/* Returns true when `ch` is one of the listed whitespace code points or lies
 * in the 0x2000.. range tested in the default branch. This diff widens that
 * range from 0x2000..0x200A to 0x2000..0x200B (U+200B ZERO WIDTH SPACE). */
bool utf32IsSpace(int ch) {
- //ripped from the output of glib-2.60.0
switch (ch) {
case 0x9: case 0xA: case 0xC: case 0xD: case 0x20: case 0xA0: case 0x1680:
case 0x2028: case 0x2029: case 0x202F: case 0x205F: case 0x3000:
return true;
default:
- return (ch >= 0x2000 && ch <= 0x200A);
+ return (ch >= 0x2000 && ch <= 0x200B);
+ }
+}
+
/* Added by this diff: the same set as utf32IsSpace() except for the three
 * commented-out cases 0xA0 (NO-BREAK SPACE), 0x2007 (FIGURE SPACE) and
 * 0x202F (NARROW NO-BREAK SPACE), for which this function returns false. */
+bool utf32IsBreakableSpace(int ch) {
+ switch (ch) {
+ case 0x9: case 0xA: case 0xC: case 0xD: case 0x20: /* case 0xA0: */ case 0x1680:
+ case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: /* case 0x2007: */
+ case 0x2008: case 0x2009: case 0x200A: case 0x200B:
+ case 0x2028: case 0x2029: /* case 0x202F: */ case 0x205F: case 0x3000:
+ return true;
+ default:
+ return false;
}
}
// Returns true when `ch` falls into one of the alphabetic runs listed below.
// The data is taken from the output of glib-2.60.0 (only a subset):
// 0x0530 - 0x1CFF and everything from 0x2000 upwards are not covered and
// always yield false.
bool utf32IsAlpha(int ch) {
    // Every table holds (first code point, run length) pairs in ascending order.
    static const int runs_65_247[] = {
        65, 26,
        97, 26,
        170, 1,
        181, 1,
        186, 1,
        192, 23,
        216, 31,
    };
    static const int runs_248_751[] = {
        248, 458,
        710, 12,
        736, 5,
        748, 1,
        750, 1,
    };
    static const int runs_880_1328[] = {
        880, 5,
        886, 2,
        890, 4,
        895, 1,
        902, 1,
        904, 3,
        908, 1,
        910, 20,
        931, 83,
        1015, 139,
        1162, 166,
    };
    static const int runs_7424_8189[] = {
        7424, 192,
        7680, 278,
        7960, 6,
        7968, 38,
        8008, 6,
        8016, 8,
        8025, 1,
        8027, 1,
        8029, 1,
        8031, 31,
        8064, 53,
        8118, 7,
        8126, 1,
        8130, 3,
        8134, 7,
        8144, 4,
        8150, 6,
        8160, 13,
        8178, 3,
        8182, 7,
    };

    // A section covers the half-open interval [first, pastLast) and names the
    // run table responsible for it; code points between sections are not alpha.
    struct Section {
        int first;
        int pastLast;
        const int *runs;
        int length;
    };
    static const Section sections[] = {
        { 65, 247, runs_65_247, (int)(sizeof(runs_65_247) / sizeof(runs_65_247[0])) },
        { 248, 751, runs_248_751, (int)(sizeof(runs_248_751) / sizeof(runs_248_751[0])) },
        { 880, 1328, runs_880_1328, (int)(sizeof(runs_880_1328) / sizeof(runs_880_1328[0])) },
        { 7424, 8189, runs_7424_8189, (int)(sizeof(runs_7424_8189) / sizeof(runs_7424_8189[0])) },
    };
    const int sectionCount = (int)(sizeof(sections) / sizeof(sections[0]));

    for (int s = 0; s < sectionCount; s++) {
        const Section &section = sections[s];
        if (ch < section.first) return false;      // falls into a gap before this section
        if (ch >= section.pastLast) continue;      // belongs to a later section, if any

        // Runs are sorted, so the scan can stop at the first run starting after ch.
        for (int k = 0; k < section.length; k += 2) {
            const int runStart = section.runs[k];
            if (ch < runStart) return false;
            if (ch < runStart + section.runs[k + 1]) return true;
        }
        return false;
    }
    return false;
}
// Returns true when `ch` lies in one of the CJK code point ranges below
// (all bounds inclusive).
bool utf32IsCJK(int ch) {
    /* CJK scripts and symbols */
    if (ch >= 0x002E80 && ch <= 0x009FFF) return true;
    /* CJK Compatibility Ideographs */
    if (ch >= 0x00F900 && ch <= 0x00FAFF) return true;
    /* CJK Compatibility Forms */
    if (ch >= 0x00FE30 && ch <= 0x00FE4F) return true;
    /* Supplementary Ideographic Plane & Tertiary Ideographic Plane */
    if (ch >= 0x020000 && ch <= 0x03FFFF) return true;
    return false;
}
// Returns true when `ch` is one of the listed punctuation code points
// (list ripped from M$ Word).
bool utf32IsCJKEndingPunctuation(int ch) {
    static const int listed[] = {
        0x21, 0x25, 0x29, 0x2C, 0x2E, 0x3A, 0x3B, 0x3E, 0x3F, 0x5D, 0x7D,
        0xA2, 0xA8, 0xB0, 0xB7,
        0x2C7, 0x2C9,
        0x2015, 0x2016, 0x2019, 0x201D, 0x2026, 0x2030, 0x2032, 0x2033, 0x203A, 0x2103, 0x2236,
        0x3001, 0x3002, 0x3003, 0x3009, 0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x301E,
        0x0FE36, 0x0FE3A, 0x0FE3E, 0x0FE40, 0x0FE44, 0x0FE5A, 0x0FE5C, 0x0FE5E,
        0x0FF01, 0x0FF02, 0x0FF05, 0x0FF07, 0x0FF09, 0x0FF0C, 0x0FF0E, 0x0FF1A, 0x0FF1B, 0x0FF1F,
        0x0FF3D, 0x0FF40, 0x0FF5C, 0x0FF5D, 0x0FF5E, 0x0FFE0,
    };
    const int *const stop = listed + sizeof(listed) / sizeof(listed[0]);
    return std::find(listed, stop, ch) != stop;
}
// Returns true when `ch` is one of the listed punctuation code points
// (list ripped from M$ Word).
bool utf32IsCJKStartingPunctuation(int ch) {
    static const int listed[] = {
        0x24, 0x28, 0x5B, 0x7B,
        0xA3, 0xA5, 0xB7,
        0x2018, 0x201C,
        0x3008, 0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x301D,
        0x0FE59, 0x0FE5B, 0x0FE5D,
        0x0FF04, 0x0FF08, 0x0FF0E,
        0x0FF3B, 0x0FF5B, 0x0FFE1, 0x0FFE5,
    };
    const int *const stop = listed + sizeof(listed) / sizeof(listed[0]);
    return std::find(listed, stop, ch) != stop;
}
// Map a code point to its lowercase counterpart; code points without an entry
// in the tables below are returned unchanged.
// The data is ripped from the output of glib-2.60.0.
//
// Two kinds of tables are consulted, in this order:
//  1. "delta" tables of (first, count, delta) triples: every code point in
//     [first, first + count) maps to ch + delta.
//  2. "alternating" tables of (first, last) pairs, both inclusive: code points
//     at an even distance from `first` map to ch + 1, the others are unchanged.
int utf32ToLower(int ch) {
    static const int ranges_65_223[] = {
        65, 26, 32,
        192, 23, 32,
        216, 7, 32,
    };
    static const int ranges_304_504[] = {
        304, 1, -199,
        376, 1, -121,
        385, 1, 210,
        390, 1, 206,
        393, 2, 205,
        398, 1, 79,
        399, 1, 202,
        400, 1, 203,
        403, 1, 205,
        404, 1, 207,
        406, 1, 211,
        407, 1, 209,
        412, 1, 211,
        413, 1, 213,
        415, 1, 214,
        422, 1, 218,
        425, 1, 218,
        430, 1, 218,
        433, 2, 217,
        439, 1, 219,
        452, 1, 2,
        455, 1, 2,
        458, 1, 2,
        497, 1, 2,
        502, 1, -97,
        503, 1, -56,
    };
    static const int ranges_544_582[] = {
        544, 1, -130,
        570, 1, 10795,
        573, 1, -163,
        574, 1, 10792,
        579, 1, -195,
        580, 1, 69,
        581, 1, 71,
    };
    static const int ranges_895_1018[] = {
        895, 1, 116,
        902, 1, 38,
        904, 3, 37,
        908, 1, 64,
        910, 2, 63,
        913, 17, 32,
        931, 9, 32,
        975, 1, 8,
        1012, 1, -60,
        1017, 1, -7,
    };
    static const int ranges_1021_1367[] = {
        1021, 3, -130,
        1024, 16, 80,
        1040, 32, 32,
        1216, 1, 15,
        1329, 38, 48,
    };
    static const int ranges_4256_5110[] = {
        4256, 38, 7264,
        4295, 1, 7264,
        4301, 1, 7264,
        5024, 80, 38864,
        5104, 6, 8,
    };
    static const int ranges_7312_8499[] = {
        7312, 43, -3008,
        7357, 3, -3008,
        7838, 1, -7615,
        7944, 8, -8,
        7960, 6, -8,
        7976, 8, -8,
        7992, 8, -8,
        8008, 6, -8,
        8025, 1, -8,
        8027, 1, -8,
        8029, 1, -8,
        8031, 1, -8,
        8040, 8, -8,
        8072, 8, -8,
        8088, 8, -8,
        8104, 8, -8,
        8120, 2, -8,
        8122, 2, -74,
        8124, 1, -9,
        8136, 4, -86,
        8140, 1, -9,
        8152, 2, -8,
        8154, 2, -100,
        8168, 2, -8,
        8170, 2, -112,
        8172, 1, -7,
        8184, 2, -128,
        8186, 2, -126,
        8188, 1, -9,
        8486, 1, -7517,
        8490, 1, -8383,
        8491, 1, -8262,
        8498, 1, 28,
    };
    static const int ranges_11264_11392[] = {
        11264, 47, 48,
        11362, 1, -10743,
        11363, 1, -3814,
        11364, 1, -10727,
        11373, 1, -10780,
        11374, 1, -10749,
        11375, 1, -10783,
        11376, 1, -10782,
        11390, 2, -10815,
    };
    static const int ranges_42877_42932[] = {
        42877, 1, -35332,
        42893, 1, -42280,
        42922, 1, -42308,
        42923, 1, -42319,
        42924, 1, -42315,
        42925, 1, -42305,
        42926, 1, -42308,
        42928, 1, -42258,
        42929, 1, -42282,
        42930, 1, -42261,
        42931, 1, 928,
    };
    static const int ranges_65313_125218[] = {
        65313, 26, 32,
        66560, 40, 40,
        66736, 36, 40,
        68736, 51, 64,
        71840, 32, 32,
        93760, 32, 32,
        125184, 34, 34,
    };
    static const int ranges2_256_440[] = {
        256, 302,
        306, 310,
        313, 327,
        330, 374,
        377, 381,
        386, 388,
        391, 391,
        395, 395,
        401, 401,
        408, 408,
        416, 420,
        423, 423,
        428, 428,
        431, 431,
        435, 437,
        440, 440,
    };
    static const int ranges2_444_590[] = {
        444, 444,
        453, 453,
        456, 456,
        459, 475,
        478, 494,
        498, 500,
        504, 542,
        546, 562,
        571, 571,
        577, 577,
        582, 590,
    };
    static const int ranges2_880_1326[] = {
        880, 882,
        886, 886,
        984, 1006,
        1015, 1015,
        1018, 1018,
        1120, 1152,
        1162, 1214,
        1217, 1229,
        1232, 1326,
    };
    static const int ranges2_7680_11506[] = {
        7680, 7828,
        7840, 7934,
        8579, 8579,
        11360, 11360,
        11367, 11371,
        11378, 11378,
        11381, 11381,
        11392, 11490,
        11499, 11501,
        11506, 11506,
    };
    static const int ranges2_42560_42936[] = {
        42560, 42604,
        42624, 42650,
        42786, 42798,
        42802, 42862,
        42873, 42875,
        42878, 42886,
        42891, 42891,
        42896, 42898,
        42902, 42920,
        42932, 42936,
    };

    // A section names the table responsible for the code points from `lo` up
    // to `hi`; whether `hi` itself is included depends on the table kind.
    struct Section {
        int lo;
        int hi;
        const int *data;
        int length; // number of ints in `data`
    };
#define UTF32_TOLOWER_LEN(T) ((int)(sizeof(T) / sizeof((T)[0])))
    // Delta sections cover [lo, hi).
    static const Section deltaSections[] = {
        { 65, 223, ranges_65_223, UTF32_TOLOWER_LEN(ranges_65_223) },
        { 304, 504, ranges_304_504, UTF32_TOLOWER_LEN(ranges_304_504) },
        { 544, 582, ranges_544_582, UTF32_TOLOWER_LEN(ranges_544_582) },
        { 895, 1018, ranges_895_1018, UTF32_TOLOWER_LEN(ranges_895_1018) },
        { 1021, 1367, ranges_1021_1367, UTF32_TOLOWER_LEN(ranges_1021_1367) },
        { 4256, 5110, ranges_4256_5110, UTF32_TOLOWER_LEN(ranges_4256_5110) },
        { 7312, 8499, ranges_7312_8499, UTF32_TOLOWER_LEN(ranges_7312_8499) },
        { 11264, 11392, ranges_11264_11392, UTF32_TOLOWER_LEN(ranges_11264_11392) },
        { 42877, 42932, ranges_42877_42932, UTF32_TOLOWER_LEN(ranges_42877_42932) },
        { 65313, 125218, ranges_65313_125218, UTF32_TOLOWER_LEN(ranges_65313_125218) },
    };
    // Alternating sections cover [lo, hi].
    static const Section alternatingSections[] = {
        { 256, 440, ranges2_256_440, UTF32_TOLOWER_LEN(ranges2_256_440) },
        { 444, 590, ranges2_444_590, UTF32_TOLOWER_LEN(ranges2_444_590) },
        { 880, 1326, ranges2_880_1326, UTF32_TOLOWER_LEN(ranges2_880_1326) },
        { 7680, 11506, ranges2_7680_11506, UTF32_TOLOWER_LEN(ranges2_7680_11506) },
        { 42560, 42936, ranges2_42560_42936, UTF32_TOLOWER_LEN(ranges2_42560_42936) },
    };
    const int deltaSectionCount = UTF32_TOLOWER_LEN(deltaSections);
    const int alternatingSectionCount = UTF32_TOLOWER_LEN(alternatingSections);
#undef UTF32_TOLOWER_LEN

    // Pass 1: pick the delta table (if any) whose section contains ch.
    const int *triples = NULL;
    int tripleInts = 0;
    for (int s = 0; s < deltaSectionCount; s++) {
        if (ch < deltaSections[s].lo) break; // in a gap: no table applies
        if (ch < deltaSections[s].hi) {
            triples = deltaSections[s].data;
            tripleInts = deltaSections[s].length;
            break;
        }
    }
    for (int k = 0; k < tripleInts; k += 3) {
        const int first = triples[k];
        if (ch < first) break; // rows are sorted, nothing further can match
        if (ch < first + triples[k + 1]) {
            return ch + triples[k + 2];
        }
    }

    // Pass 2: pick the alternating table (if any) whose section contains ch.
    const int *pairs = NULL;
    int pairInts = 0;
    for (int s = 0; s < alternatingSectionCount; s++) {
        if (ch < alternatingSections[s].lo) break;
        if (ch <= alternatingSections[s].hi) {
            pairs = alternatingSections[s].data;
            pairInts = alternatingSections[s].length;
            break;
        }
    }
    for (int k = 0; k < pairInts; k += 2) {
        const int first = pairs[k];
        if (ch < first) break;
        if (ch <= pairs[k + 1]) {
            // Only the code points at an even offset within the run are mapped.
            if (((ch - first) & 0x1) == 0) return ch + 1;
            break;
        }
    }

// Drop the RANGE/CHECK_RANGE helper macros in case an earlier definition in
// this translation unit left them defined.
#undef RANGE
#undef CHECK_RANGE
    return ch;
}
diff --git a/src/UTF8Functions.h b/src/UTF8Functions.h
index c34345e..4ab2e68 100644
--- a/src/UTF8Functions.h
+++ b/src/UTF8Functions.h
@@ -1,162 +1,163 @@
/*
* Copyright (C) 2019 Me and My Shadow
*
* This file is part of Me and My Shadow.
*
* Me and My Shadow is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Me and My Shadow is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef UTF8FUNCTIONS_H
#define UTF8FUNCTIONS_H
// A helper function to read a character from utf8 string and advance the pointer
// s: the string
// p [in,out]: the position
// return value: the character read, in utf32 format, 0 means end of string, -1 means error
int utf8ReadForward(const char* s, int& p);
// A helper function to read a character backward from utf8 string and advance the pointer (experimental)
// s: the string
// p [in,out]: the position
// return value: the character read, in utf32 format, 0 means end of string, -1 means error
int utf8ReadBackward(const char* s, int& p);
// Decode the first character of a utf8 string without keeping track of a position.
// s: the string
// return value: whatever utf8ReadForward() returns for position 0, i.e. the character
//   in utf32 format, 0 for end of string, -1 for an error
// NOTE: use utf8ReadForward() directly when reading several characters in a row
inline int utf8GetCharacter(const char* s) {
    int scratch = 0; // throw-away position; the advanced value is not needed
    const int decoded = utf8ReadForward(s, scratch);
    return decoded;
}
// A helper function to advance the pointer in a utf8 string to next character
// s: the pointer
// return value: the new pointer
// WARNING: there is no sanity check!
const char* utf8GoToNextCharacter(const char* s);
// A helper function to advance the pointer in a utf8 string to previous character
// s: the pointer
// return value: the new pointer
// WARNING: there is no sanity check!
const char* utf8GoToPrevCharacter(const char* s);
// Helpers that operate on a single character given as an int (utf32 format).
bool utf32IsSpace(int ch);
+bool utf32IsBreakableSpace(int ch);
bool utf32IsAlpha(int ch);
bool utf32IsCJK(int ch);
bool utf32IsCJKEndingPunctuation(int ch); // check if the character shouldn't be at start of line in CJK mode
bool utf32IsCJKStartingPunctuation(int ch); // check if the character shouldn't be at end of line in CJK mode
int utf32ToLower(int ch);
/* Loop over the characters of a UTF-8 byte sequence.
 *   STR        - indexable byte sequence; STR[I] is read for I < M
 *   I          - name of the size_t index variable declared by the macro
 *   M          - number of bytes to process
 *   CH         - name of the int variable declared by the macro; inside the
 *                loop body it holds the decoded character in utf32 format
 *   INVALID_CH - value given to CH for a stray continuation byte, a truncated
 *                or malformed sequence, a lead byte >= 0xF8, or a result of
 *                0x110000 and above
 * After an invalid sequence only the offending lead byte is consumed.
 * The statements following the macro form the loop body, which has to be
 * closed with U8STRING_FOR_EACH_CHARACTER_DO_END(). */
#define U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(STR,I,M,CH,INVALID_CH) \
    for (size_t I = 0; I < M; I++) { \
        int CH = (unsigned char)STR[I]; \
        if (CH < 0x80) { \
            /* one byte: CH already is the character */ \
        } else if (CH < 0xC0) { \
            /* continuation byte without a lead byte */ \
            CH = INVALID_CH; \
        } else if (CH < 0xE0) { \
            /* two-byte sequence */ \
            if (I + 1 >= M) { \
                CH = INVALID_CH; \
            } else { \
                int u8tail1 = (unsigned char)STR[I + 1]; \
                if ((u8tail1 & 0xC0) == 0x80) { \
                    CH = ((CH & 0x1F) << 6) | (u8tail1 & 0x3F); \
                    I++; \
                } else { \
                    CH = INVALID_CH; \
                } \
            } \
        } else if (CH < 0xF0) { \
            /* three-byte sequence */ \
            if (I + 2 >= M) { \
                CH = INVALID_CH; \
            } else { \
                int u8tail1 = (unsigned char)STR[I + 1]; \
                int u8tail2 = (unsigned char)STR[I + 2]; \
                if ((u8tail1 & 0xC0) == 0x80 && (u8tail2 & 0xC0) == 0x80) { \
                    CH = ((CH & 0xF) << 12) | ((u8tail1 & 0x3F) << 6) | (u8tail2 & 0x3F); \
                    I += 2; \
                } else { \
                    CH = INVALID_CH; \
                } \
            } \
        } else if (CH < 0xF8) { \
            /* four-byte sequence */ \
            if (I + 3 >= M) { \
                CH = INVALID_CH; \
            } else { \
                int u8tail1 = (unsigned char)STR[I + 1]; \
                int u8tail2 = (unsigned char)STR[I + 2]; \
                int u8tail3 = (unsigned char)STR[I + 3]; \
                if ((u8tail1 & 0xC0) == 0x80 && (u8tail2 & 0xC0) == 0x80 && (u8tail3 & 0xC0) == 0x80) { \
                    CH = ((CH & 0x7) << 18) | ((u8tail1 & 0x3F) << 12) | ((u8tail2 & 0x3F) << 6) | (u8tail3 & 0x3F); \
                    if (CH >= 0x110000) CH = INVALID_CH; \
                    else I += 3; \
                } else { \
                    CH = INVALID_CH; \
                } \
            } \
        } else { \
            /* 0xF8 and above never start a sequence */ \
            CH = INVALID_CH; \
        }
/* Closes the loop opened by U8STRING_FOR_EACH_CHARACTER_DO_BEGIN. */
#define U8STRING_FOR_EACH_CHARACTER_DO_END() }
/* Encode one character as UTF-8 and emit the bytes one at a time.
 *   CH        - the character in utf32 format. It is evaluated several times,
 *               so it must not have side effects. Every use is parenthesised
 *               so that an expression such as `a | b` is encoded as a whole.
 *   OPERATION - callable invoked once per output byte, e.g. `str.push_back`
 * Emits 1 byte below 0x80, 2 bytes below 0x800, 3 bytes below 0x10000 and
 * 4 bytes otherwise. The value is not checked for validity. */
#define U8_ENCODE(CH,OPERATION) \
if((CH)<0x80){ \
OPERATION(CH); \
}else if((CH)<0x800){ \
OPERATION(0xC0 | ((CH)>>6)); \
OPERATION(0x80 | ((CH) & 0x3F)); \
}else if((CH)<0x10000){ \
OPERATION(0xE0 | ((CH)>>12)); \
OPERATION(0x80 | (((CH)>>6) & 0x3F)); \
OPERATION(0x80 | ((CH) & 0x3F)); \
}else{ \
OPERATION(0xF0 | ((CH)>>18)); \
OPERATION(0x80 | (((CH)>>12) & 0x3F)); \
OPERATION(0x80 | (((CH)>>6) & 0x3F)); \
OPERATION(0x80 | ((CH) & 0x3F)); \
}
/* Loop over the characters of a UTF-16 code unit sequence.
 *   STR        - indexable sequence of 16-bit code units
 *   I          - name of the size_t index variable declared by the macro
 *   M          - number of code units to process
 *   CH         - name of the int variable declared by the macro; inside the
 *                loop body it holds the decoded character in utf32 format
 *   INVALID_CH - value given to CH for an unpaired lead or trail surrogate
 * A lead surrogate (0xD800..0xDBFF) is combined with the following unit only
 * when that unit is a trail surrogate (0xDC00..0xDFFF); the check has to look
 * at the second unit, not at the lead unit again. Otherwise CH becomes
 * INVALID_CH and the following unit is processed on its own.
 * The statements following the macro form the loop body, which has to be
 * closed with U16STRING_FOR_EACH_CHARACTER_DO_END(). */
#define U16STRING_FOR_EACH_CHARACTER_DO_BEGIN(STR,I,M,CH,INVALID_CH) \
for(size_t I=0;I<M;I++){ \
int CH=(unsigned short)(STR[I]); \
if(CH<0xD800){ \
}else if(CH<0xDC00){ \
/* lead surrogate */ \
I++; \
if(I>=M) CH=INVALID_CH; \
else{ \
int c2=(unsigned short)STR[I]; \
if(c2>=0xDC00 && c2<0xE000){ \
/* trail surrogate */ \
CH=0x10000 + (((CH & 0x3FF)<<10) | (c2 & 0x3FF)); \
}else{ \
/* invalid: step back so the second unit is decoded by itself */ \
CH=INVALID_CH; \
I--; \
} \
} \
}else if(CH<0xE000){ \
/* invalid trail surrogate */ \
CH=INVALID_CH; \
}
/* Closes the loop opened by U16STRING_FOR_EACH_CHARACTER_DO_BEGIN. */
#define U16STRING_FOR_EACH_CHARACTER_DO_END() }
/* Encode one character as UTF-16 and emit the code units one at a time.
 *   CH        - the character in utf32 format. It is evaluated several times,
 *               so it must not have side effects. Every use is parenthesised
 *               so that an expression such as `a | b` is encoded as a whole.
 *   OPERATION - callable invoked once per output code unit
 * Values below 0x10000 are emitted as a single unit, everything else as a
 * lead/trail surrogate pair. The value is not checked for validity. */
#define U16_ENCODE(CH,OPERATION) \
if((CH)<0x10000){ \
OPERATION(CH); \
}else{ \
OPERATION(0xD800 | (((CH)-0x10000)>>10)); \
OPERATION(0xDC00 | ((CH) & 0x3FF)); \
}
const int REPLACEMENT_CHARACTER = 0x00FFFD;
#endif
diff --git a/src/WordWrapper.cpp b/src/WordWrapper.cpp
index ceae42b..3bc20cd 100644
--- a/src/WordWrapper.cpp
+++ b/src/WordWrapper.cpp
@@ -1,329 +1,329 @@
/*
* Copyright (C) 2019 Me and My Shadow
*
* This file is part of Me and My Shadow.
*
* Me and My Shadow is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Me and My Shadow is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
*/
#include "WordWrapper.h"
#include "HyphenationManager.h"
#include "HyphenationRule.h"
#include "UTF8Functions.h"
#include <algorithm>
#include <assert.h>
#include <SDL_ttf_fontfallback.h>
int WordWrapper::getTextWidth(const std::string& s) {
if (s.empty()) return 0;
int w = 0;
if (font) {
TTF_SizeUTF8(font, s.c_str(), &w, NULL);
} else {
const size_t m = s.size();
U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(s, i, m, ch, REPLACEMENT_CHARACTER);
w++;
U8STRING_FOR_EACH_CHARACTER_DO_END();
}
return w;
}
// Width of a single glyph. Without a font every glyph counts as 1; otherwise
// the value written to the last output argument of TTF_GlyphMetrics() is
// returned (0 if the call leaves it untouched).
int WordWrapper::getGlyphWidth(int ch) {
    if (!font) return 1;

    int glyphWidth = 0;
    TTF_GlyphMetrics(font, ch, NULL, NULL, NULL, NULL, &glyphWidth);
    return glyphWidth;
}
// Initial state: no font, maximal width 0, word wrap disabled and hyperlinks
// not reserved.
WordWrapper::WordWrapper()
    : font(NULL), maxWidth(0), wordWrap(false), reserveHyperlinks(false)
{}
// The destructor body is intentionally empty.
WordWrapper::~WordWrapper()
{
}
bool WordWrapper::isReserved(const std::string& word) {
if (reserveHyperlinks) {
const char *s = word.c_str();
const size_t m = word.size();
for (size_t i = 0; i < m; i++) {
// we only support http or https
if ((s[i] == 'H' || s[i] == 'h')
&& (s[i + 1] == 'T' || s[i + 1] == 't')
&& (s[i + 2] == 'T' || s[i + 2] == 't')
&& (s[i + 3] == 'P' || s[i + 3] == 'p'))
{
if (s[i + 4] == ':' && s[i + 5] == '/' && s[i + 6] == '/') {
// http
return true;
} else if ((s[i + 4] == 'S' || s[i + 4] == 's') && s[i + 5] == ':' && s[i + 6] == '/' && s[i + 7] == '/') {
// https
return true;
}
}
}
}
for (const std::string& s : reservedWords) {
if (word == s) return true;
}
for (const std::string& s : reservedFragments) {
if (word.find(s) != std::string::npos) return true;
}
return false;
}
// Split the input at '\n' (dropping every '\r') and pass each piece to
// addLine(). The text after the last '\n' is always passed on, even when it
// is empty. Returns the largest value returned by addLine().
int WordWrapper::addString(std::vector<std::string>& output, const std::string& input) {
    int widest = 0;
    std::string current;

    for (std::string::const_iterator it = input.begin(); it != input.end(); ++it) {
        switch (*it) {
        case '\r':
            // carriage returns are ignored
            break;
        case '\n':
            widest = std::max(addLine(output, current), widest);
            current.clear();
            break;
        default:
            current.push_back(*it);
            break;
        }
    }

    return std::max(addLine(output, current), widest);
}
// Add a word to line, output the line only if the line+newWord doesn't fit the width and in this case put the newWord to the line.
// Returns the maximal width required for this string.
// output: receives every line that is completed while adding the word.
// line, lineWidth [in,out]: the line being built and its current width.
// spaces: the white spaces preceding the word; they are dropped when the word starts a new line.
// nonSpaces: the word itself; it may be split at hyphenation points when it doesn't fit.
int WordWrapper::addWord(std::vector<std::string>& output, std::string& line, int& lineWidth, const std::string& spaces, const std::string& nonSpaces) {
int w1 = getTextWidth(spaces);
{
int w2 = getTextWidth(nonSpaces);
//Check if it fits into current line.
if (lineWidth + w1 + w2 <= maxWidth) {
line += spaces + nonSpaces;
lineWidth += w1 + w2;
return lineWidth;
}
//Now it doesn't fit into current line.
//Check if we should skip the hyphenation.
if (hyphen.empty() || isReserved(nonSpaces)) {
if (line.empty()) {
//A line consists of at least one word, so we append it forcefully.
line += spaces + nonSpaces;
lineWidth += w1 + w2;
return lineWidth;
} else {
//We output current line.
output.push_back(line);
//And add a new line consisting of new word (but we remove spaces in it).
line = nonSpaces;
int mw = std::max(lineWidth, w2);
lineWidth = w2;
return mw;
}
}
}
//From here on the word has to be split using the hyphenation rules.
auto hm = getHyphenationManager();
auto hyphenator = hyphenatorLanguage.empty() ? hm->getHyphenator() : hm->getHyphenator(hyphenatorLanguage);
auto rules = hyphenator->applyHyphenationRules(nonSpaces);
const size_t m = nonSpaces.size();
//tmp: the fragment collected since the last break. prev: the widest fragment (with hyphen applied) found so far that still fits.
std::string tmp, prev;
//prevSkip/prevWidth/prevIndex: the state remembered together with prev, used for rewinding.
int skip = 0, prevSkip = 0, prevWidth = 0;
size_t prevIndex = 0;
int mw = lineWidth;
//The loop runs one step past the last byte (i == m) so that the final fragment is handled, too.
for (size_t i = 0;; i++) {
const Hyphenate::HyphenationRule *rule = (i < m) ? (*rules)[i] : NULL;
if (rule || i == m) {
//tmp2 is the fragment as it would look if we broke the word here.
std::string tmp2 = tmp;
if (rule) rule->apply_first(tmp2, hyphen);
int newWidth = getTextWidth(tmp2);
/*//debug
printf("%-5d %s\n", newWidth, tmp2.c_str());*/
//Check if we should output current line directly.
if (lineWidth + w1 + newWidth > maxWidth && prev.empty() && !line.empty()) {
//We output current line.
output.push_back(line);
mw = std::max(lineWidth, mw);
line.clear();
lineWidth = 0;
//The leading spaces are not carried over to the new line.
w1 = 0;
}
//Check if the line is still too long.
if (lineWidth + w1 + newWidth > maxWidth) {
//Check if we have previous available hyphenation
if (prev.empty()) {
//Line is empty, we have to append it forcefully.
assert(line.empty());
if (w1 > 0) line += spaces;
line += tmp2;
if (i < m) {
output.push_back(line);
mw = std::max(lineWidth, mw);
line.clear();
lineWidth = 0;
w1 = 0;
} else {
lineWidth += w1 + newWidth;
mw = std::max(lineWidth, mw);
}
//Update buffer
tmp.clear();
if (rule) skip += rule->apply_second(tmp);
} else {
//We use previous available hyphenation
if (w1 > 0) line += spaces;
output.push_back(line + prev);
mw = std::max(lineWidth + w1 + prevWidth, mw);
line.clear();
lineWidth = 0;
w1 = 0;
//Rewind
prev.clear();
prevWidth = 0;
skip = prevSkip;
i = prevIndex;
//Update buffer
tmp.clear();
rule = (*rules)[i];
assert(rule != NULL);
skip += rule->apply_second(tmp);
}
} else if (i == m) {
//Output last part
if (w1 > 0) line += spaces;
line += tmp2;
lineWidth += w1 + newWidth;
mw = std::max(lineWidth, mw);
} else if (newWidth > prevWidth) {
//Update prev hyphenation
prev = tmp2;
prevSkip = skip;
prevWidth = newWidth;
prevIndex = i;
}
}
if (i >= m) break;
//Bytes covered by a pending skip are dropped, all others are collected in tmp.
if (skip > 0) skip--;
else tmp.push_back(nonSpaces[i]);
}
return mw;
}
// Add one line of text to the output, wrapping it when wordWrap is set.
// output: receives the resulting line(s).
// input: the text of the line; '\r' characters are ignored when wrapping.
// Returns the text width of the input when word wrap is disabled, otherwise
// the largest value returned by addWord().
// In this diff the white space test switches from utf32IsSpace() to utf32IsBreakableSpace().
int WordWrapper::addLine(std::vector<std::string>& output, const std::string& input) {
if (!wordWrap) {
//Word wrap is not enabled, simply add it to output
output.push_back(input);
return getTextWidth(input);
}
const size_t m = input.size();
//spaces/nonSpaces collect the pending word, line is the output line being built.
std::string spaces, nonSpaces, line;
int lineWidth = 0, mw = 0;
bool prevIsCJK = false, prevIsCJKStarting = false;
U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(input, i, m, ch, REPLACEMENT_CHARACTER);
//A word consists of a sequence of white spaces and a sequence of non-white-spaces.
//For CJK should only read one CJK character (possibly with a punctuation mark)
if (ch == '\r') {
- } else if (utf32IsSpace(ch)) {
+ } else if (utf32IsBreakableSpace(ch)) {
prevIsCJK = false;
prevIsCJKStarting = false;
//A white space after non-white-spaces ends the pending word.
if (!nonSpaces.empty()) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
spaces.clear();
nonSpaces.clear();
}
U8_ENCODE(ch, spaces.push_back);
} else {
bool isCJK = utf32IsCJK(ch);
bool isCJKStarting = utf32IsCJKStartingPunctuation(ch);
if (prevIsCJK) {
//Output the CJK character immediately unless current character can't be at start of line
if (!utf32IsCJKEndingPunctuation(ch)) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
spaces.clear();
nonSpaces.clear();
}
} else if (isCJK && !nonSpaces.empty()) {
//Output the existing non-CJK character immediately unless it can't be at end of line
if (!prevIsCJKStarting) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
spaces.clear();
nonSpaces.clear();
}
}
prevIsCJK = isCJK;
prevIsCJKStarting = isCJKStarting;
U8_ENCODE(ch, nonSpaces.push_back);
}
U8STRING_FOR_EACH_CHARACTER_DO_END();
//FIXME: Here we temporarily ignore trailing spaces
if (!nonSpaces.empty()) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
}
//Output the remaining text.
output.push_back(line);
return mw;
}
// Pass every input string to addLine() in order.
// Returns the largest value returned by addLine(), or 0 for an empty input.
int WordWrapper::addLines(std::vector<std::string>& output, const std::vector<std::string>& input) {
    int widest = 0;
    for (size_t idx = 0; idx < input.size(); idx++) {
        const int width = addLine(output, input[idx]);
        if (width > widest) widest = width;
    }
    return widest;
}

File Metadata

Mime Type
text/x-diff
Expires
Sat, May 9, 8:04 PM (6 d, 23 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
62875
Default Alt Text
(34 KB)

Event Timeline