From 6d644130f7f7467f4e7f8878a71cda7a26a2a032 Mon Sep 17 00:00:00 2001
From: Ricardo Signes
Date: Thu, 1 Oct 2020 13:30:35 -0400
Subject: [PATCH] cope with non-ASCII in bullet lists
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is two changes based on real-life data I'm formatting.

1. accept that "•" is as much a bullet as "*" or "+"
2. accept that a NBSP after a bullet is still a space; it's fine that
   it's NBSP, because we don't break there anyway!
---
 lib/Text/Autoformat.pm      |  2 +-
 lib/Text/Autoformat/Hang.pm |  6 ++++--
 t/04.non-ascii.t            | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 t/04.non-ascii.t

diff --git a/lib/Text/Autoformat.pm b/lib/Text/Autoformat.pm
index b883a0c..385a1b5 100644
--- a/lib/Text/Autoformat.pm
+++ b/lib/Text/Autoformat.pm
@@ -181,7 +181,7 @@ sub autoformat # ($text, %args)
 
  $lines[-1]{hang} = Text::Autoformat::Hang->new($_, $args{lists});
 
- s/([ \t]*)(.*?)(\s*)$//
+ s/([ \x{A0}\t]*)(.*?)(\s*)$//
  or die "Internal Error ($@) on '$_'";
  $lines[-1]{hangspace} = defn $1;
  $lines[-1]{text} = defn $2;
diff --git a/lib/Text/Autoformat/Hang.pm b/lib/Text/Autoformat/Hang.pm
index ce691b9..099e473 100644
--- a/lib/Text/Autoformat/Hang.pm
+++ b/lib/Text/Autoformat/Hang.pm
@@ -4,6 +4,8 @@ use 5.006;
 use strict;
 use warnings;
 
+use utf8;
+
 # ROMAN NUMERALS
 
 sub inv($@) { my ($k, %inv)=shift; for(0..$#_) {$inv{$_[$_]}=$_*$k} %inv }
@@ -46,8 +48,8 @@ my %close = ( '[' => ']', '(' => ')', '<' => '>', "" => '' );
 my $hangPS = qq{(?i:ps:|(?:p\\.?)+s\\b\\.?(?:[ \\t]*:)?)};
 my $hangNB = qq{(?i:n\\.?b\\.?(?:[ \\t]*:)?)};
 my $hangword = qq{(?:(?:Note)[ \\t]*:)};
-my $hangbullet = qq{[*.+-]};
-my $hang = qq{(?:(?i)(?:$hangNB|$hangword|$hangbullet)(?=[ \t]))};
+my $hangbullet = qq{[•*.+-]};
+my $hang = qq{(?:(?i)(?:$hangNB|$hangword|$hangbullet)(?=[ \x{A0}\t]))};
 
 # IMPLEMENTATION
 
diff --git a/t/04.non-ascii.t b/t/04.non-ascii.t
new file mode 100644
index 0000000..61dbc95
--- /dev/null
+++ b/t/04.non-ascii.t
@@ -0,0 +1,19 @@
+use utf8;
+use strict;
+use Test::More tests => 1;
+use Text::Autoformat;
+
+# Possibly I'm breaking this on EBCDIC… -- rjbs, 2020-10-01
+my $NBSP = "\x{A0}";
+
+my $str = <<"END";
+•${NBSP}Analyze problem
+•${NBSP}Design algorithm
+• Code solution
+• Test
+• Ship
+END
+
+my $after = autoformat $str;
+
+is($after, $str, 'we treat \N{BULLET} as a bullet and NBSP after it as space');