Page Menu
Home
Phabricator (Chris)
Search
Configure Global Search
Log In
Files
F116844
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Authored By
Unknown
Size
34 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/src/Hyphenator.cpp b/src/Hyphenator.cpp
index 4fe5e2c..b5e5da4 100644
--- a/src/Hyphenator.cpp
+++ b/src/Hyphenator.cpp
@@ -1,262 +1,262 @@
/* libhyphenate: A TeX-like hyphenation algorithm.
* Copyright (C) 2007 Steve Wolter
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* If you have any questions, feel free to contact me:
* http://swolter.sdf1.org
**/
#include "Hyphenator.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <memory>
#include <ctype.h>
#include <stdlib.h>
#include "HyphenationRule.h"
#include "HyphenationTree.h"
#include "UTF8Functions.h"
#define UTF8_MAX 6
using namespace std;
using namespace Hyphenate;
/** Load the hyphenation patterns stored in the file `filename` into a newly
 * allocated HyphenationTree and return it; ownership passes to the caller. */
static auto_ptr<HyphenationTree> read_hyphenation_table(const char *filename) {
	ifstream pattern_file(filename, ifstream::in);
	auto_ptr<HyphenationTree> tree(new HyphenationTree());
	tree->loadPatterns(pattern_file);
	return tree;
}
/** Build a hyphenator from the patterns in the file provided.
 * \param filename path of the pattern file; it is handed unchanged to
 *        read_hyphenation_table(), whose result becomes `dictionary`. */
Hyphenate::Hyphenator::Hyphenator(const char *filename) {
dictionary = read_hyphenation_table(filename);
}
/** Destructor with an intentionally empty body. */
Hyphenator::~Hyphenator() {}
/** Hyphenate every word found in a piece of UTF-8 text.
 *
 * The input is scanned character by character. Runs of alphabetic
 * characters (as judged by utf32IsAlpha) are passed to hyphenate_word();
 * every other byte is copied to the output unchanged.
 *
 * \param word   the UTF-8 text to process (may contain several words).
 * \param hyphen the string to insert at each hyphenation point.
 * \return the text with hyphenation applied to each word. */
std::string Hyphenator::hyphenate
(const std::string &word, const std::string &hyphen)
{
	std::string result;

	/* Byte offset at which the word currently being collected started, or
	 * npos while we are between words. This must have the same type as
	 * std::string::npos: with a narrower unsigned type the comparisons
	 * against npos below can never be true on platforms where size_t is
	 * wider than unsigned int. */
	std::string::size_type word_start = std::string::npos;

	/* Go through the input. All non-alpha characters are added to the
	 * output immediately, and words are hyphenated and then added. */
	for (std::string::size_type i = 0; i < word.size(); i++) {
		/* UTF-8 tail bytes never start a character, so they cannot change
		 * the word/non-word state; only lead bytes are classified. */
		const bool is_tail_byte = (word[i] & 0xC0) == 0x80;
		if (!is_tail_byte) {
			const bool is_alpha = utf32IsAlpha(utf8GetCharacter(word.c_str() + i));

			if (word_start == std::string::npos && is_alpha) {
				word_start = i;
			} else if (word_start != std::string::npos && !is_alpha) {
				/* The word ended just before this character. */
				result +=
					hyphenate_word(word.substr(word_start, i - word_start), hyphen);
				word_start = std::string::npos;
			}
		}

		/* Outside of a word, bytes are passed through verbatim. */
		if (word_start == std::string::npos)
			result += word[i];
	}

	/* The input may end in the middle of a word. */
	if (word_start != std::string::npos)
		result += hyphenate_word(word.substr(word_start), hyphen);

	return result;
}
std::string Hyphenator::hyphenate_word
(const std::string &word, const std::string &hyphen)
{
auto_ptr<vector<const HyphenationRule*> > rules =
dictionary->applyPatterns(word);
/* Build our result string. Of course, we _could_ insert characters in
* w, but that would be highly inefficient. */
string result;
int acc_skip = 0;
for (unsigned int i = 0; i < word.size(); i++) {
if ((*rules)[i] != NULL)
acc_skip += (*rules)[i]->apply(result, hyphen);
if (acc_skip > 0)
acc_skip--;
else
result += word[i];
}
return result;
}
/* Split src into two parts at or before the character with index len.
 * result.first receives the text kept on the current line, result.second
 * the text that continues on the next line. The split is made at a
 * breakable space when one is available, otherwise at a hyphenation point
 * of a word, otherwise after the first unbreakable block.
 * src:    the UTF-8 text to split.
 * hyphen: the string used by the hyphenation rules at the split point.
 * len:    number of characters (not bytes) to advance from the start of src.
 * NOTE(review): the helpers that step backwards perform no bounds check, so
 * this presumably relies on src not starting with breakable spaces - confirm. */
pair<std::string, std::string> Hyphenator::hyphenate_at
(const std::string &src, const std::string &hyphen, size_t len)
{
/* First of all, find the word which needs to be hyphenated. */
const char *cur = src.c_str();
for (unsigned int i = 0; i < len; i++)
cur = utf8GoToNextCharacter(cur);
/* next is the character after cur, unless cur itself is a breakable space. */
const char *next = cur;
- if (!utf32IsSpace(utf8GetCharacter(next)))
+ if (!utf32IsBreakableSpace(utf8GetCharacter(next)))
next = utf8GoToNextCharacter(next);
pair<string, string> result;
- if (utf32IsSpace(utf8GetCharacter(next))) {
+ if (utf32IsBreakableSpace(utf8GetCharacter(next))) {
/* We are lucky: There is a space we can hyphenate at. */
/* We leave no spaces at the end of a line: */
- while (utf32IsSpace(utf8GetCharacter(cur)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(cur)))
cur = utf8GoToPrevCharacter(cur);
/* This local len is a byte count and shadows the parameter of the same name. */
int len = cur - src.c_str() + 1;
result.first = src.substr(0, len);
/* Neither do we leave spaces at the beginning of the next. */
- while (utf32IsSpace(utf8GetCharacter(next)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(next)))
next = utf8GoToNextCharacter(next);
result.second = src.substr(next - src.c_str());
} else {
/* We can hyphenate at hyphenation points in words or at spaces, whatever
* comes earlier. We will check all words here in the loop. */
/* border remembers the requested split position; a hyphenation must end at or before it. */
const char *border = cur;
while (true) {
/* Find the start of a word first. */
bool in_word = utf32IsAlpha(utf8GetCharacter(cur));
const char *word_start = NULL;
while (cur > src.c_str()) {
cur = utf8GoToPrevCharacter(cur);
int ch = utf8GetCharacter(cur);
if (in_word && (!utf32IsAlpha(ch))) {
/* If we have a word, try hyphenating it.*/
word_start = utf8GoToNextCharacter(cur);
break;
- } else if (utf32IsSpace(ch)) {
+ } else if (utf32IsBreakableSpace(ch)) {
break;
} else if (!in_word && utf32IsAlpha(ch))
in_word = true;
if (cur == src.c_str() && in_word)
word_start = cur;
}
/* There are two reasons why we may have left the previous loop with-
* out result:
* Either because our word goes all the way to the first character,
* or because we found whitespace. */
/* In the first case, there is nothing really hyphenateable. */
if (word_start != NULL) {
/* We have the start of a word, now look for the character after
* the end. */
const char *word_end = word_start;
while (utf32IsAlpha(utf8GetCharacter(word_end)))
word_end = utf8GoToNextCharacter(word_end);
/* Build the substring consisting of the word. */
string word;
for (const char *i = word_start; i < word_end; i++)
word += *i;
/* Hyphenate the word. */
auto_ptr<vector<const HyphenationRule*> > rules =
dictionary->applyPatterns(word);
/* Determine the index of the latest hyphenation that will still
* fit. */
int latest_possible_hyphenation = -1;
int earliest_hyphenation = -1;
for (int i = 0; i < (int)rules->size(); i++)
if ((*rules)[i] != NULL) {
if (earliest_hyphenation == -1)
earliest_hyphenation = i;
/* The comparison is done in bytes: word prefix, the rule's pre-hyphen text and the hyphen itself must end at or before border. */
if (word_start + i +
(*rules)[i]->spaceNeededPreHyphen() + hyphen.length()
<= border)
{
if (i > latest_possible_hyphenation) {
latest_possible_hyphenation = i;
}
} else
break;
}
/* Check whether any breakable space occurs from the start of src up to the word start. */
bool have_space = false;
for (const char *i = src.c_str(); i <= word_start;
i = utf8GoToNextCharacter(i))
- if (utf32IsSpace(utf8GetCharacter(i))) {
+ if (utf32IsBreakableSpace(utf8GetCharacter(i))) {
have_space = true;
break;
}
/* With no fitting hyphenation and no space to fall back to, use the earliest hyphenation point even though it does not fit. */
if (latest_possible_hyphenation == -1 && !have_space)
latest_possible_hyphenation = earliest_hyphenation;
/* Apply the best hyphenation, if any. */
if (latest_possible_hyphenation >= 0) {
int i = latest_possible_hyphenation;
result.first = src.substr(0, word_start - src.c_str() + i);
(*rules)[i]->apply_first(result.first, hyphen);
/* apply_second writes the start of the second part and returns how many source bytes to skip. */
int skip = (*rules)[i]->apply_second(result.second);
const char *after_hyphen = word_start + i + skip;
result.second += string(after_hyphen);
break;
}
}
if (cur == src.c_str()) {
/* We cannot hyphenate at all, so leave the first block standing
* and move to its end. */
const char *eol = cur;
- while (*eol != 0 && !utf32IsSpace(utf8GetCharacter(eol)))
+ while (*eol != 0 && !utf32IsBreakableSpace(utf8GetCharacter(eol)))
eol = utf8GoToNextCharacter(eol);
result.first = src.substr(0, eol - src.c_str() + 1);
- while (*eol != 0 && utf32IsSpace(utf8GetCharacter(eol)))
+ while (*eol != 0 && utf32IsBreakableSpace(utf8GetCharacter(eol)))
eol = utf8GoToNextCharacter(eol);
result.second = string(eol);
break;
- } else if (utf32IsSpace(utf8GetCharacter(cur))) {
+ } else if (utf32IsBreakableSpace(utf8GetCharacter(cur))) {
/* eol is the end of the previous line, bol the start of the
* next. */
const char *eol = cur, *bol = cur;
- while (utf32IsSpace(utf8GetCharacter(eol)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(eol)))
eol = utf8GoToPrevCharacter(eol);
- while (utf32IsSpace(utf8GetCharacter(bol)))
+ while (utf32IsBreakableSpace(utf8GetCharacter(bol)))
bol = utf8GoToNextCharacter(bol);
result.first = src.substr(0, eol - src.c_str() + 1);
result.second = string(bol);
break;
}
}
}
return result;
}
/** Return the hyphenation rules the dictionary finds for `word`.
 * This forwards directly to dictionary->applyPatterns(word); other code in
 * this file indexes the returned vector by byte position within the word
 * and treats NULL entries as "no hyphenation here". */
std::auto_ptr<std::vector<const HyphenationRule*> >
Hyphenate::Hyphenator::applyHyphenationRules(const std::string& word)
{
return dictionary->applyPatterns(word);
}
diff --git a/src/UTF8Functions.cpp b/src/UTF8Functions.cpp
index 2c34b61..0e9ae46 100644
--- a/src/UTF8Functions.cpp
+++ b/src/UTF8Functions.cpp
@@ -1,512 +1,523 @@
/*
* Copyright (C) 2019 Me and My Shadow
*
* This file is part of Me and My Shadow.
*
* Me and My Shadow is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Me and My Shadow is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <algorithm>
#include <string>
#include "UTF8Functions.h"
// Decode one character from a UTF-8 string and advance the position past it.
// s: the string (NUL-terminated)
// p [in,out]: byte position of the character to read; updated on return
// return value: the character read, in UTF-32 format; 0 means end of string
//   (p is not advanced), -1 means error
// On a malformed multi-byte sequence p is left at the first byte that was not
// a valid continuation byte. A stray run of continuation bytes is skipped as a whole.
int utf8ReadForward(const char* s, int& p) {
	const int lead = static_cast<unsigned char>(s[p]);

	// Single byte (ASCII), including the terminating NUL.
	if (lead < 0x80) {
		if (lead != 0) ++p;
		return lead;
	}

	// A continuation byte where a lead byte was expected: skip the whole run.
	if (lead < 0xC0) {
		while ((static_cast<unsigned char>(s[p]) & 0xC0) == 0x80) ++p;
		return -1;
	}

	// Lead bytes 0xF8 and above never start a sequence this decoder accepts.
	if (lead >= 0xF8) {
		++p;
		return -1;
	}

	// Number of continuation bytes and payload bits of the lead byte.
	int continuationBytes;
	int codePoint;
	if (lead < 0xE0) {
		continuationBytes = 1;
		codePoint = lead & 0x1F;
	} else if (lead < 0xF0) {
		continuationBytes = 2;
		codePoint = lead & 0xF;
	} else {
		continuationBytes = 3;
		codePoint = lead & 0x7;
	}

	for (int k = 0; k < continuationBytes; ++k) {
		const int next = static_cast<unsigned char>(s[++p]);
		if ((next & 0xC0) != 0x80) return -1;
		codePoint = (codePoint << 6) | (next & 0x3F);
	}
	++p;

	// Only a four-byte sequence can exceed the Unicode range.
	if (codePoint >= 0x110000) return -1;
	return codePoint;
}
// Step back to the previous character of a UTF-8 string and decode it (experimental).
// s: the string
// p [in,out]: byte position just after the character to read; on return it
//   is the position of that character's first byte
// return value: the character read, in UTF-32 format; 0 means the position was
//   already at the start of the string (p is left untouched), -1 means error
int utf8ReadBackward(const char* s, int& p) {
	if (p <= 0) return 0;

	// Move back over continuation bytes until a lead byte (or the start) is reached.
	--p;
	while (p > 0 && (static_cast<unsigned char>(s[p]) & 0xC0) == 0x80) {
		--p;
	}

	// Decode from a scratch copy so that p stays on the character's first byte.
	int scratch = p;
	return utf8ReadForward(s, scratch);
}
// Return a pointer to the first byte of the character following the one at s.
// At the terminating NUL the pointer is returned unchanged.
const char* utf8GoToNextCharacter(const char* s) {
	if (*s == 0) return s;

	// Step over the lead byte, then over any continuation bytes that follow.
	++s;
	while ((static_cast<unsigned char>(*s) & 0xC0) == 0x80) {
		++s;
	}
	return s;
}
// Return a pointer to the first byte of the character preceding s.
// There is no bounds check: the caller must make sure a character exists before s.
const char* utf8GoToPrevCharacter(const char* s) {
	// Always step back at least one byte, then keep going over continuation bytes.
	--s;
	while ((static_cast<unsigned char>(*s) & 0xC0) == 0x80) {
		--s;
	}
	return s;
}
// Returns true if ch is one of the code points listed below as white space.
// The list includes the non-breaking spaces 0xA0 and 0x202F; the new side of
// this hunk extends the 0x2000 range by one code point, to 0x200B.
bool utf32IsSpace(int ch) {
- //ripped from the output of glib-2.60.0
switch (ch) {
case 0x9: case 0xA: case 0xC: case 0xD: case 0x20: case 0xA0: case 0x1680:
case 0x2028: case 0x2029: case 0x202F: case 0x205F: case 0x3000:
return true;
default:
- return (ch >= 0x2000 && ch <= 0x200A);
+ return (ch >= 0x2000 && ch <= 0x200B);
+ }
+}
+
// Returns true if ch is a space at which a line may be broken. It accepts the
// same code points as utf32IsSpace except the three that are commented out
// in the case lists: 0xA0, 0x2007 and 0x202F.
+bool utf32IsBreakableSpace(int ch) {
+ switch (ch) {
+ case 0x9: case 0xA: case 0xC: case 0xD: case 0x20: /* case 0xA0: */ case 0x1680:
+ case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: /* case 0x2007: */
+ case 0x2008: case 0x2009: case 0x200A: case 0x200B:
+ case 0x2028: case 0x2029: /* case 0x202F: */ case 0x205F: case 0x3000:
+ return true;
+ default:
+ return false;
}
}
// Returns true if ch is an alphabetic character according to the table below.
// Only code points below 0x2000 are covered, and 0x0530 - 0x1CFF is skipped
// entirely; everything outside the table is reported as non-alphabetic.
bool utf32IsAlpha(int ch) {
	//ripped from the output of glib-2.60.0 (only a subset)
	// Each row is {first code point, number of consecutive alphabetic code points}.
	// Rows are in ascending order and do not overlap.
	static const int alphaRuns[][2] = {
		{ 65, 26 },
		{ 97, 26 },
		{ 170, 1 },
		{ 181, 1 },
		{ 186, 1 },
		{ 192, 23 },
		{ 216, 31 },

		{ 248, 458 },
		{ 710, 12 },
		{ 736, 5 },
		{ 748, 1 },
		{ 750, 1 },

		{ 880, 5 },
		{ 886, 2 },
		{ 890, 4 },
		{ 895, 1 },
		{ 902, 1 },
		{ 904, 3 },
		{ 908, 1 },
		{ 910, 20 },
		{ 931, 83 },
		{ 1015, 139 },
		{ 1162, 166 },

		// skip 0x0530 - 0x1CFF

		{ 7424, 192 },
		{ 7680, 278 },
		{ 7960, 6 },
		{ 7968, 38 },
		{ 8008, 6 },
		{ 8016, 8 },
		{ 8025, 1 },
		{ 8027, 1 },
		{ 8029, 1 },
		{ 8031, 31 },
		{ 8064, 53 },
		{ 8118, 7 },
		{ 8126, 1 },
		{ 8130, 3 },
		{ 8134, 7 },
		{ 8144, 4 },
		{ 8150, 6 },
		{ 8160, 13 },
		{ 8178, 3 },
		{ 8182, 7 },

		// skip 0x2000 - 0x10FFFF
	};
	const int runCount = static_cast<int>(sizeof(alphaRuns) / sizeof(alphaRuns[0]));

	for (int k = 0; k < runCount; ++k) {
		const int first = alphaRuns[k][0];
		// Rows are sorted, so a code point below this row lies in a gap.
		if (ch < first) return false;
		if (ch - first < alphaRuns[k][1]) return true;
	}

	return false;
}
// Returns true if ch lies in one of the CJK blocks listed below.
bool utf32IsCJK(int ch) {
	// Nothing below the first block qualifies.
	if (ch < 0x002E80) return false;

	/* CJK scripts and symbols */
	if (ch <= 0x009FFF) return true;

	/* CJK Compatibility Ideographs */
	if (ch >= 0x00F900 && ch <= 0x00FAFF) return true;

	/* CJK Compatibility Forms */
	if (ch >= 0x00FE30 && ch <= 0x00FE4F) return true;

	/* Supplementary Ideographic Plane & Tertiary Ideographic Plane */
	return ch >= 0x020000 && ch <= 0x03FFFF;
}
// Returns true if ch is in the list of closing punctuation marks below.
bool utf32IsCJKEndingPunctuation(int ch) {
	//ripped from M$ Word
	static const int endingMarks[] = {
		0x21, 0x25, 0x29, 0x2C, 0x2E, 0x3A, 0x3B, 0x3E, 0x3F, 0x5D, 0x7D,
		0xA2, 0xA8, 0xB0, 0xB7,
		0x2C7, 0x2C9,
		0x2015, 0x2016, 0x2019, 0x201D, 0x2026, 0x2030, 0x2032, 0x2033, 0x203A, 0x2103, 0x2236,
		0x3001, 0x3002, 0x3003, 0x3009, 0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x301E,
		0x0FE36, 0x0FE3A, 0x0FE3E, 0x0FE40, 0x0FE44, 0x0FE5A, 0x0FE5C, 0x0FE5E,
		0x0FF01, 0x0FF02, 0x0FF05, 0x0FF07, 0x0FF09, 0x0FF0C, 0x0FF0E, 0x0FF1A, 0x0FF1B, 0x0FF1F,
		0x0FF3D, 0x0FF40, 0x0FF5C, 0x0FF5D, 0x0FF5E, 0x0FFE0,
	};
	const int markCount = static_cast<int>(sizeof(endingMarks) / sizeof(endingMarks[0]));

	for (int k = 0; k < markCount; ++k) {
		if (endingMarks[k] == ch) return true;
	}
	return false;
}
// Returns true if ch is in the list of opening punctuation marks below.
bool utf32IsCJKStartingPunctuation(int ch) {
	//ripped from M$ Word
	static const int startingMarks[] = {
		0x24, 0x28, 0x5B, 0x7B,
		0xA3, 0xA5, 0xB7,
		0x2018, 0x201C,
		0x3008, 0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x301D,
		0x0FE59, 0x0FE5B, 0x0FE5D,
		0x0FF04, 0x0FF08, 0x0FF0E,
		0x0FF3B, 0x0FF5B, 0x0FFE1, 0x0FFE5,
	};
	const int markCount = static_cast<int>(sizeof(startingMarks) / sizeof(startingMarks[0]));

	for (int k = 0; k < markCount; ++k) {
		if (startingMarks[k] == ch) return true;
	}
	return false;
}
// Convert a UTF-32 code point to lower case using the tables below.
// ch: the code point to convert
// return value: the mapped code point, or ch unchanged if no table entry applies
//
// Two kinds of tables are used:
// - ranges_A_B: triples {first, count, delta}; every code point in
//   [first, first + count) maps to ch + delta.
// - ranges2_A_B: pairs {first, last} (inclusive); inside such a range the code
//   points at an even offset from first map to ch + 1, the others are left alone.
// The A_B suffix of a table name gives the bounds used to select that table.
int utf32ToLower(int ch) {
//ripped from the output of glib-2.60.0
static const int ranges_65_223[] = {
65, 26, 32,
192, 23, 32,
216, 7, 32,
};
static const int ranges_304_504[] = {
304, 1, -199,
376, 1, -121,
385, 1, 210,
390, 1, 206,
393, 2, 205,
398, 1, 79,
399, 1, 202,
400, 1, 203,
403, 1, 205,
404, 1, 207,
406, 1, 211,
407, 1, 209,
412, 1, 211,
413, 1, 213,
415, 1, 214,
422, 1, 218,
425, 1, 218,
430, 1, 218,
433, 2, 217,
439, 1, 219,
452, 1, 2,
455, 1, 2,
458, 1, 2,
497, 1, 2,
502, 1, -97,
503, 1, -56,
};
static const int ranges_544_582[] = {
544, 1, -130,
570, 1, 10795,
573, 1, -163,
574, 1, 10792,
579, 1, -195,
580, 1, 69,
581, 1, 71,
};
static const int ranges_895_1018[] = {
895, 1, 116,
902, 1, 38,
904, 3, 37,
908, 1, 64,
910, 2, 63,
913, 17, 32,
931, 9, 32,
975, 1, 8,
1012, 1, -60,
1017, 1, -7,
};
static const int ranges_1021_1367[] = {
1021, 3, -130,
1024, 16, 80,
1040, 32, 32,
1216, 1, 15,
1329, 38, 48,
};
static const int ranges_4256_5110[] = {
4256, 38, 7264,
4295, 1, 7264,
4301, 1, 7264,
5024, 80, 38864,
5104, 6, 8,
};
static const int ranges_7312_8499[] = {
7312, 43, -3008,
7357, 3, -3008,
7838, 1, -7615,
7944, 8, -8,
7960, 6, -8,
7976, 8, -8,
7992, 8, -8,
8008, 6, -8,
8025, 1, -8,
8027, 1, -8,
8029, 1, -8,
8031, 1, -8,
8040, 8, -8,
8072, 8, -8,
8088, 8, -8,
8104, 8, -8,
8120, 2, -8,
8122, 2, -74,
8124, 1, -9,
8136, 4, -86,
8140, 1, -9,
8152, 2, -8,
8154, 2, -100,
8168, 2, -8,
8170, 2, -112,
8172, 1, -7,
8184, 2, -128,
8186, 2, -126,
8188, 1, -9,
8486, 1, -7517,
8490, 1, -8383,
8491, 1, -8262,
8498, 1, 28,
};
static const int ranges_11264_11392[] = {
11264, 47, 48,
11362, 1, -10743,
11363, 1, -3814,
11364, 1, -10727,
11373, 1, -10780,
11374, 1, -10749,
11375, 1, -10783,
11376, 1, -10782,
11390, 2, -10815,
};
static const int ranges_42877_42932[] = {
42877, 1, -35332,
42893, 1, -42280,
42922, 1, -42308,
42923, 1, -42319,
42924, 1, -42315,
42925, 1, -42305,
42926, 1, -42308,
42928, 1, -42258,
42929, 1, -42282,
42930, 1, -42261,
42931, 1, 928,
};
static const int ranges_65313_125218[] = {
65313, 26, 32,
66560, 40, 40,
66736, 36, 40,
68736, 51, 64,
71840, 32, 32,
93760, 32, 32,
125184, 34, 34,
};
// Tables of the second kind: inclusive {first, last} pairs.
static const int ranges2_256_440[] = {
256, 302,
306, 310,
313, 327,
330, 374,
377, 381,
386, 388,
391, 391,
395, 395,
401, 401,
408, 408,
416, 420,
423, 423,
428, 428,
431, 431,
435, 437,
440, 440,
};
static const int ranges2_444_590[] = {
444, 444,
453, 453,
456, 456,
459, 475,
478, 494,
498, 500,
504, 542,
546, 562,
571, 571,
577, 577,
582, 590,
};
static const int ranges2_880_1326[] = {
880, 882,
886, 886,
984, 1006,
1015, 1015,
1018, 1018,
1120, 1152,
1162, 1214,
1217, 1229,
1232, 1326,
};
static const int ranges2_7680_11506[] = {
7680, 7828,
7840, 7934,
8579, 8579,
11360, 11360,
11367, 11371,
11378, 11378,
11381, 11381,
11392, 11490,
11499, 11501,
11506, 11506,
};
static const int ranges2_42560_42936[] = {
42560, 42604,
42624, 42650,
42786, 42798,
42802, 42862,
42873, 42875,
42878, 42886,
42891, 42891,
42896, 42898,
42902, 42920,
42932, 42936,
};
// The table selected for ch (if any) and its length in ints.
const int *ranges = NULL, *ranges2 = NULL;
int rangeSize = 0, range2Size = 0;
// CHECK_RANGE(LPS,LPE) extends an if/else-if chain: it selects table
// ranges_LPS_LPE when LPS <= ch < LPE and selects nothing when ch < LPS.
#define RANGE(LPS,LPE) ranges_##LPS##_##LPE
#define CHECK_RANGE(LPS,LPE) \
else if (ch < LPS) { \
} else if (ch < LPE) { \
ranges = RANGE(LPS,LPE); rangeSize = sizeof(RANGE(LPS,LPE)) / sizeof(RANGE(LPS,LPE)[0]); \
}
// CHECK_RANGE2 works the same way for the ranges2_ tables, but its upper bound is inclusive.
#define RANGE2(LPS,LPE) ranges2_##LPS##_##LPE
#define CHECK_RANGE2(LPS,LPE) \
else if (ch < LPS) { \
} else if (ch <= LPE) { \
ranges2 = RANGE2(LPS,LPE); range2Size = sizeof(RANGE2(LPS,LPE)) / sizeof(RANGE2(LPS,LPE)[0]); \
}
// The empty if gives the macro-generated else-if chain something to attach to.
if (false) {}
CHECK_RANGE(65, 223)
CHECK_RANGE(304, 504)
CHECK_RANGE(544, 582)
CHECK_RANGE(895, 1018)
CHECK_RANGE(1021, 1367)
CHECK_RANGE(4256, 5110)
CHECK_RANGE(7312, 8499)
CHECK_RANGE(11264, 11392)
CHECK_RANGE(42877, 42932)
CHECK_RANGE(65313, 125218)
// Scan the selected triple table; entries are ascending, so stop at the first one that starts above ch.
for (int i = 0; i < rangeSize; i += 3) {
const int lps = ranges[i];
const int lpe = lps + ranges[i + 1];
if (ch < lps) break;
else if (ch < lpe) {
return ch + ranges[i + 2];
}
}
// No triple matched: try the tables of the second kind.
if (false) {}
CHECK_RANGE2(256, 440)
CHECK_RANGE2(444, 590)
CHECK_RANGE2(880, 1326)
CHECK_RANGE2(7680, 11506)
CHECK_RANGE2(42560, 42936)
for (int i = 0; i < range2Size; i += 2) {
const int lps = ranges2[i];
const int lpe = ranges2[i + 1];
if (ch < lps) break;
else if (ch <= lpe) {
// Only code points at an even offset from the range start are mapped.
if (((ch - lps) & 0x1) == 0) return ch + 1;
else break;
}
}
#undef RANGE
#undef RANGE2
#undef CHECK_RANGE
#undef CHECK_RANGE2
// No mapping applies.
return ch;
}
diff --git a/src/UTF8Functions.h b/src/UTF8Functions.h
index c34345e..4ab2e68 100644
--- a/src/UTF8Functions.h
+++ b/src/UTF8Functions.h
@@ -1,162 +1,163 @@
/*
* Copyright (C) 2019 Me and My Shadow
*
* This file is part of Me and My Shadow.
*
* Me and My Shadow is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Me and My Shadow is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef UTF8FUNCTIONS_H
#define UTF8FUNCTIONS_H
// A helper function to read a character from utf8 string and advance the pointer
// s: the string
// p [in,out]: the position
// return value: the character read, in UTF-32 format; 0 means end of string, -1 means error
int utf8ReadForward(const char* s, int& p);
// A helper function to read a character backward from utf8 string and advance the pointer (experimental)
// s: the string
// p [in,out]: the position
// return value: the character read, in UTF-32 format; 0 means end of string, -1 means error
int utf8ReadBackward(const char* s, int& p);
// A helper function to read the first character from a UTF-8 string
// s: the string
// return value: the character read, in UTF-32 format; 0 means end of string, -1 means error
// NOTE: Consider utf8ReadForward() instead if you want to read multiple characters
inline int utf8GetCharacter(const char* s) {
// Decode at position 0; the advanced position is discarded.
int tmp = 0;
return utf8ReadForward(s, tmp);
}
// A helper function to advance the pointer in a utf8 string to next character
// s: the pointer
// return value: the new pointer
// WARNING: there is no sanity check!
const char* utf8GoToNextCharacter(const char* s);
// A helper function to advance the pointer in a utf8 string to previous character
// s: the pointer
// return value: the new pointer
// WARNING: there is no sanity check!
const char* utf8GoToPrevCharacter(const char* s);
bool utf32IsSpace(int ch);
+bool utf32IsBreakableSpace(int ch);
bool utf32IsAlpha(int ch);
bool utf32IsCJK(int ch);
bool utf32IsCJKEndingPunctuation(int ch); // check if the character shouldn't be at the start of a line in CJK mode
bool utf32IsCJKStartingPunctuation(int ch); // check if the character shouldn't be at the end of a line in CJK mode
int utf32ToLower(int ch);
// Opens a loop that decodes the UTF-8 data in STR one character at a time.
// Must be closed with U8STRING_FOR_EACH_CHARACTER_DO_END().
// STR: an object indexable with [] that yields the bytes
// I: name of the loop index variable (size_t, byte offset) declared by the macro
// M: number of bytes in STR
// CH: name of the int variable declared by the macro to hold the decoded code point
// INVALID_CH: value assigned to CH for a byte that does not start a valid sequence;
//   in that case only that single byte is consumed
// A sequence is rejected when it is truncated by M, when a continuation byte
// is missing, or when a 4-byte sequence decodes to 0x110000 or above.
#define U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(STR,I,M,CH,INVALID_CH) \
for(size_t I=0;I<M;I++){ \
int CH=(unsigned char)STR[I]; \
if(CH<0x80){ \
}else if(CH<0xC0){ \
CH=INVALID_CH; \
}else if(CH<0xE0){ \
if(I+1>=M) CH=INVALID_CH; \
else{ \
int c2=(unsigned char)STR[I+1]; \
if((c2&0xC0)!=0x80) CH=INVALID_CH; \
else{ \
CH=((CH & 0x1F)<<6) | (c2 & 0x3F); \
I++; \
} \
} \
}else if(CH<0xF0){ \
if(I+2>=M) CH=INVALID_CH; \
else{ \
int c2=(unsigned char)STR[I+1]; \
int c3=(unsigned char)STR[I+2]; \
if((c2&0xC0)!=0x80 || (c3&0xC0)!=0x80) CH=INVALID_CH; \
else{ \
CH=((CH & 0xF)<<12) | ((c2 & 0x3F)<<6) | (c3 & 0x3F); \
I+=2; \
} \
} \
}else if(CH<0xF8){ \
if(I+3>=M) CH=INVALID_CH; \
else{ \
int c2=(unsigned char)STR[I+1]; \
int c3=(unsigned char)STR[I+2]; \
int c4=(unsigned char)STR[I+3]; \
if((c2&0xC0)!=0x80 || (c3&0xC0)!=0x80 || (c4&0xC0)!=0x80) CH=INVALID_CH; \
else{ \
CH=((CH & 0x7)<<18) | ((c2 & 0x3F)<<12) | ((c3 & 0x3F)<<6) | (c4 & 0x3F); \
if(CH>=0x110000) CH=INVALID_CH; \
else I+=3; \
} \
} \
}else{ \
CH=INVALID_CH; \
}
// Closes the loop opened by U8STRING_FOR_EACH_CHARACTER_DO_BEGIN.
#define U8STRING_FOR_EACH_CHARACTER_DO_END() }
// Encodes the code point CH as UTF-8 and passes each resulting byte, in order,
// to OPERATION (anything callable as OPERATION(byte), e.g. str.push_back).
// Emits 1 byte below 0x80, 2 below 0x800, 3 below 0x10000 and 4 otherwise.
// CH is not validated and is evaluated several times.
#define U8_ENCODE(CH,OPERATION) \
if(CH<0x80){ \
OPERATION(CH); \
}else if(CH<0x800){ \
OPERATION(0xC0 | (CH>>6)); \
OPERATION(0x80 | (CH & 0x3F)); \
}else if(CH<0x10000){ \
OPERATION(0xE0 | (CH>>12)); \
OPERATION(0x80 | ((CH>>6) & 0x3F)); \
OPERATION(0x80 | (CH & 0x3F)); \
}else{ \
OPERATION(0xF0 | (CH>>18)); \
OPERATION(0x80 | ((CH>>12) & 0x3F)); \
OPERATION(0x80 | ((CH>>6) & 0x3F)); \
OPERATION(0x80 | (CH & 0x3F)); \
}
// Opens a loop that decodes the UTF-16 data in STR one character at a time.
// Must be closed with U16STRING_FOR_EACH_CHARACTER_DO_END().
// STR: an object indexable with [] that yields the 16-bit code units
// I: name of the loop index variable (size_t, code unit offset) declared by the macro
// M: number of code units in STR
// CH: name of the int variable declared by the macro to hold the decoded code point
// INVALID_CH: value assigned to CH for an unpaired surrogate; only that single
//   code unit is consumed, so the unit after a lone lead surrogate is decoded again on its own
#define U16STRING_FOR_EACH_CHARACTER_DO_BEGIN(STR,I,M,CH,INVALID_CH) \
	for(size_t I=0;I<M;I++){ \
		int CH=(unsigned short)(STR[I]); \
		if(CH<0xD800){ \
		}else if(CH<0xDC00){ \
			/* lead surrogate */ \
			I++; \
			if(I>=M) CH=INVALID_CH; \
			else{ \
				int c2=(unsigned short)STR[I]; \
				/* The pair is valid only if the NEXT unit (c2) is a trail surrogate. */ \
				if(c2>=0xDC00 && c2<0xE000){ \
					/* trail surrogate */ \
					CH=0x10000 + (((CH & 0x3FF)<<10) | (c2 & 0x3FF)); \
				}else{ \
					/* invalid */ \
					CH=INVALID_CH; \
					I--; \
				} \
			} \
		}else if(CH<0xE000){ \
			/* invalid trail surrogate */ \
			CH=INVALID_CH; \
		}
// Closes the loop opened by U16STRING_FOR_EACH_CHARACTER_DO_BEGIN.
#define U16STRING_FOR_EACH_CHARACTER_DO_END() }
// Encodes the code point CH as UTF-16 and passes each resulting code unit, in
// order, to OPERATION. Code points below 0x10000 are passed through as a single
// unit; anything else is split into a lead (0xD800-based) and a trail
// (0xDC00-based) surrogate. CH is not validated and is evaluated several times.
#define U16_ENCODE(CH,OPERATION) \
if(CH<0x10000){ \
OPERATION(CH); \
}else{ \
OPERATION(0xD800 | ((CH-0x10000)>>10)); \
OPERATION(0xDC00 | (CH & 0x3FF)); \
}
const int REPLACEMENT_CHARACTER = 0x00FFFD;
#endif
diff --git a/src/WordWrapper.cpp b/src/WordWrapper.cpp
index ceae42b..3bc20cd 100644
--- a/src/WordWrapper.cpp
+++ b/src/WordWrapper.cpp
@@ -1,329 +1,329 @@
/*
* Copyright (C) 2019 Me and My Shadow
*
* This file is part of Me and My Shadow.
*
* Me and My Shadow is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Me and My Shadow is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Me and My Shadow. If not, see <http://www.gnu.org/licenses/>.
*/
#include "WordWrapper.h"
#include "HyphenationManager.h"
#include "HyphenationRule.h"
#include "UTF8Functions.h"
#include <algorithm>
#include <assert.h>
#include <SDL_ttf_fontfallback.h>
int WordWrapper::getTextWidth(const std::string& s) {
if (s.empty()) return 0;
int w = 0;
if (font) {
TTF_SizeUTF8(font, s.c_str(), &w, NULL);
} else {
const size_t m = s.size();
U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(s, i, m, ch, REPLACEMENT_CHARACTER);
w++;
U8STRING_FOR_EACH_CHARACTER_DO_END();
}
return w;
}
// Measure the width of a single character.
// Without a font every character is one unit wide; with a font the value
// written to the last output argument of TTF_GlyphMetrics is returned
// (0 if the call leaves it untouched).
int WordWrapper::getGlyphWidth(int ch) {
	if (!font) return 1;

	int glyphWidth = 0;
	TTF_GlyphMetrics(font, ch, NULL, NULL, NULL, NULL, &glyphWidth);
	return glyphWidth;
}
// Creates a wrapper with no font (widths are then counted in characters),
// a maximal width of 0, and both word wrapping and hyperlink reservation disabled.
WordWrapper::WordWrapper()
: font(NULL)
, maxWidth(0)
, wordWrap(false)
, reserveHyperlinks(false)
{
}
// Destructor with an intentionally empty body.
WordWrapper::~WordWrapper() {
}
bool WordWrapper::isReserved(const std::string& word) {
if (reserveHyperlinks) {
const char *s = word.c_str();
const size_t m = word.size();
for (size_t i = 0; i < m; i++) {
// we only support http or https
if ((s[i] == 'H' || s[i] == 'h')
&& (s[i + 1] == 'T' || s[i + 1] == 't')
&& (s[i + 2] == 'T' || s[i + 2] == 't')
&& (s[i + 3] == 'P' || s[i + 3] == 'p'))
{
if (s[i + 4] == ':' && s[i + 5] == '/' && s[i + 6] == '/') {
// http
return true;
} else if ((s[i + 4] == 'S' || s[i + 4] == 's') && s[i + 5] == ':' && s[i + 6] == '/' && s[i + 7] == '/') {
// https
return true;
}
}
}
}
for (const std::string& s : reservedWords) {
if (word == s) return true;
}
for (const std::string& s : reservedFragments) {
if (word.find(s) != std::string::npos) return true;
}
return false;
}
// Split the input at '\n' and feed each piece to addLine().
// '\r' characters are dropped. The text after the last '\n' is always added as
// a line too, even when it is empty.
// Returns the largest value returned by addLine() for any piece.
int WordWrapper::addString(std::vector<std::string>& output, const std::string& input) {
	int widest = 0;
	std::string current;

	for (std::string::const_iterator it = input.begin(); it != input.end(); ++it) {
		switch (*it) {
		case '\r':
			// Carriage returns are ignored.
			break;
		case '\n':
			widest = std::max(addLine(output, current), widest);
			current.clear();
			break;
		default:
			current.push_back(*it);
			break;
		}
	}

	return std::max(addLine(output, current), widest);
}
// Add a word to line, output the line only if the line+newWord doesn't fit the width and in this case put the newWord to the line.
// Returns the maximal width required for this string.
// output: receives every line that is completed while adding the word
// line [in,out]: the line currently being built
// lineWidth [in,out]: the width of line, kept in sync with it
// spaces: the white space preceding the word; dropped when the word starts a new line
// nonSpaces: the word itself
// If the word does not fit and hyphenation is allowed (hyphen is non-empty and
// the word is not reserved), the word is split at hyphenation points supplied by the hyphenator.
int WordWrapper::addWord(std::vector<std::string>& output, std::string& line, int& lineWidth, const std::string& spaces, const std::string& nonSpaces) {
int w1 = getTextWidth(spaces);
{
int w2 = getTextWidth(nonSpaces);
//Check if it fits into current line.
if (lineWidth + w1 + w2 <= maxWidth) {
line += spaces + nonSpaces;
lineWidth += w1 + w2;
return lineWidth;
}
//Now it doesn't fit into current line.
//Check if we should skip the hyphenation.
if (hyphen.empty() || isReserved(nonSpaces)) {
if (line.empty()) {
//A line consists of at least one word, so we append it forcefully.
line += spaces + nonSpaces;
lineWidth += w1 + w2;
return lineWidth;
} else {
//We output current line.
output.push_back(line);
//And add a new line consisting of new word (but we remove spaces in it).
line = nonSpaces;
int mw = std::max(lineWidth, w2);
lineWidth = w2;
return mw;
}
}
}
//The word needs hyphenation: fetch the rules, one slot per byte of the word.
auto hm = getHyphenationManager();
auto hyphenator = hyphenatorLanguage.empty() ? hm->getHyphenator() : hm->getHyphenator(hyphenatorLanguage);
auto rules = hyphenator->applyHyphenationRules(nonSpaces);
const size_t m = nonSpaces.size();
//tmp: the part of the word collected since the last break.
//prev, prevSkip, prevWidth, prevIndex: the state saved at the last hyphenation point that still fitted.
//skip: number of upcoming bytes of the word that a rule has already consumed.
std::string tmp, prev;
int skip = 0, prevSkip = 0, prevWidth = 0;
size_t prevIndex = 0;
int mw = lineWidth;
//The loop runs one step past the end of the word (i == m) to flush the last part.
for (size_t i = 0;; i++) {
const Hyphenate::HyphenationRule *rule = (i < m) ? (*rules)[i] : NULL;
if (rule || i == m) {
//tmp2 is the candidate text for the current line: tmp plus what the rule puts before the break.
std::string tmp2 = tmp;
if (rule) rule->apply_first(tmp2, hyphen);
int newWidth = getTextWidth(tmp2);
/*//debug
printf("%-5d %s\n", newWidth, tmp2.c_str());*/
//Check if we should output current line directly.
if (lineWidth + w1 + newWidth > maxWidth && prev.empty() && !line.empty()) {
//We output current line.
output.push_back(line);
mw = std::max(lineWidth, mw);
line.clear();
lineWidth = 0;
w1 = 0;
}
//Check if the line is still too long.
if (lineWidth + w1 + newWidth > maxWidth) {
//Check if we have previous available hyphenation
if (prev.empty()) {
//Line is empty, we have to append it forcefully.
assert(line.empty());
if (w1 > 0) line += spaces;
line += tmp2;
if (i < m) {
output.push_back(line);
mw = std::max(lineWidth, mw);
line.clear();
lineWidth = 0;
w1 = 0;
} else {
lineWidth += w1 + newWidth;
mw = std::max(lineWidth, mw);
}
//Update buffer
tmp.clear();
if (rule) skip += rule->apply_second(tmp);
} else {
//We use previous available hyphenation
if (w1 > 0) line += spaces;
output.push_back(line + prev);
mw = std::max(lineWidth + w1 + prevWidth, mw);
line.clear();
lineWidth = 0;
w1 = 0;
//Rewind
prev.clear();
prevWidth = 0;
skip = prevSkip;
i = prevIndex;
//Update buffer
tmp.clear();
rule = (*rules)[i];
assert(rule != NULL);
skip += rule->apply_second(tmp);
}
} else if (i == m) {
//Output last part
if (w1 > 0) line += spaces;
line += tmp2;
lineWidth += w1 + newWidth;
mw = std::max(lineWidth, mw);
} else if (newWidth > prevWidth) {
//Update prev hyphenation
prev = tmp2;
prevSkip = skip;
prevWidth = newWidth;
prevIndex = i;
}
}
if (i >= m) break;
//Copy the current byte unless a rule has consumed it.
if (skip > 0) skip--;
else tmp.push_back(nonSpaces[i]);
}
return mw;
}
// Add one line of text (without line breaks) to output, wrapping it if word wrap is enabled.
// output: receives the resulting line(s)
// input: the UTF-8 text of the line
// Returns the width of input when word wrap is disabled, otherwise the largest
// value returned by addWord() for any word of the line.
int WordWrapper::addLine(std::vector<std::string>& output, const std::string& input) {
if (!wordWrap) {
//Word wrap is not enabled, simply add it to output
output.push_back(input);
return getTextWidth(input);
}
const size_t m = input.size();
//spaces and nonSpaces hold the pending word: its leading white space and its body.
std::string spaces, nonSpaces, line;
int lineWidth = 0, mw = 0;
//State of the previous non-space character, used for the CJK breaking rules below.
bool prevIsCJK = false, prevIsCJKStarting = false;
U8STRING_FOR_EACH_CHARACTER_DO_BEGIN(input, i, m, ch, REPLACEMENT_CHARACTER);
//A word consists of a sequence of white spaces and a sequence of non-white-spaces.
//For CJK should only read one CJK character (possibly with a punctuation mark)
if (ch == '\r') {
- } else if (utf32IsSpace(ch)) {
+ } else if (utf32IsBreakableSpace(ch)) {
prevIsCJK = false;
prevIsCJKStarting = false;
//A space after a word body ends that word: flush it before collecting the new spaces.
if (!nonSpaces.empty()) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
spaces.clear();
nonSpaces.clear();
}
U8_ENCODE(ch, spaces.push_back);
} else {
bool isCJK = utf32IsCJK(ch);
bool isCJKStarting = utf32IsCJKStartingPunctuation(ch);
if (prevIsCJK) {
//Output the CJK character immediately unless current character can't be at start of line
if (!utf32IsCJKEndingPunctuation(ch)) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
spaces.clear();
nonSpaces.clear();
}
} else if (isCJK && !nonSpaces.empty()) {
//Output the existing non-CJK character immediately unless it can't be at end of line
if (!prevIsCJKStarting) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
spaces.clear();
nonSpaces.clear();
}
}
prevIsCJK = isCJK;
prevIsCJKStarting = isCJKStarting;
U8_ENCODE(ch, nonSpaces.push_back);
}
U8STRING_FOR_EACH_CHARACTER_DO_END();
//FIXME: Here we temporarily ignore trailing spaces
if (!nonSpaces.empty()) {
mw = std::max(addWord(output, line, lineWidth, spaces, nonSpaces), mw);
}
//Output the remaining text.
output.push_back(line);
return mw;
}
// Feed every entry of input to addLine(), in order.
// Returns the largest value returned by addLine(), or 0 for an empty input.
int WordWrapper::addLines(std::vector<std::string>& output, const std::vector<std::string>& input) {
	int widest = 0;
	for (std::vector<std::string>::const_iterator it = input.begin(); it != input.end(); ++it) {
		const int width = addLine(output, *it);
		if (widest < width) widest = width;
	}
	return widest;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sat, May 9, 8:04 PM (6 d, 23 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
62875
Default Alt Text
(34 KB)
Attached To
Mode
R79 meandmyshadow
Attached
Detach File
Event Timeline