diff --git a/src/geometry.rs b/src/geometry.rs index 9df2c5e..69d74c1 100644 --- a/src/geometry.rs +++ b/src/geometry.rs @@ -1,4 +1,5 @@ use std::cmp::Ordering; +use std::mem; use std::ops::{Add, Deref, DerefMut, Range, SubAssign}; use serde::{Deserialize, Serialize}; @@ -207,6 +208,14 @@ impl AdjustedRange { let end = first.end.max(second.end); Self(Range { start, end }) } + + pub fn overlaps_or_abuts(&self, other: &Self) -> bool { + if self.start > other.start { + other.overlaps_or_abuts(self) + } else { + self.end >= other.start + } + } } #[derive(Debug, Default)] @@ -282,9 +291,27 @@ impl RangeSet { } pub(crate) fn push(&mut self, range: AdjustedRange) { + log::trace!("Pushing range: {range:?}"); match self.overlaps_impl(&range) { Ok(index) => { self.0[index] = AdjustedRange::span_between(&self.0[index], &range); + if index < self.0.len() - 1 && self.0[index].overlaps_or_abuts(&self.0[index + 1]) { + let taken_vec = mem::take(&mut self.0); + self.0 = taken_vec.into_iter().fold(Vec::new(), |mut accum, range| { + if !accum.is_empty() { + let last_index = accum.len() - 1; + if accum[last_index].overlaps_or_abuts(&range) { + accum[last_index] = + AdjustedRange::span_between(&accum[last_index], &range); + } else { + accum.push(range); + } + } else { + accum.push(range); + } + accum + }); + } } Err(index) => { self.0.insert(index, range); @@ -303,15 +330,24 @@ impl RangeSet { } fn overlaps_impl(&self, range: &AdjustedRange) -> Result { - self.0.binary_search_by(|probe| { - if probe.end < range.start { - Ordering::Less - } else if probe.start > range.end { - Ordering::Greater - } else { - Ordering::Equal - } - }) + self.0 + .binary_search_by(|probe| { + if probe.end < range.start { + Ordering::Less + } else if probe.start > range.end { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .map(|index| { + // Ensure we return the first matching index + let mut first_index = index; + while first_index > 0 && self.0[first_index - 1].overlaps_or_abuts(range) { + 
first_index -= 1; + } + first_index + }) } } @@ -360,6 +396,27 @@ mod tests { assert_eq!(set.0[0].end, AdjustedOffset::from(8)); } + #[test] + fn test_range_set_merges_multiple_overlapping_ranges() { + let mut set = super::RangeSet::new(); + + let range1 = AdjustedRange::new(AdjustedOffset::from(0), AdjustedOffset::from(5)); + let range2 = AdjustedRange::new(AdjustedOffset::from(3), AdjustedOffset::from(18)); + let range3 = AdjustedRange::new(AdjustedOffset::from(10), AdjustedOffset::from(15)); + let range4 = AdjustedRange::new(AdjustedOffset::from(17), AdjustedOffset::from(20)); + let range5 = AdjustedRange::new(AdjustedOffset::from(23), AdjustedOffset::from(25)); + + set.push(range1); + set.push(range3); + set.push(range4); + set.push(range5); + set.push(range2); + + assert_eq!(set.0.len(), 2); + assert_eq!(set.0[0].start, AdjustedOffset::from(0)); + assert_eq!(set.0[0].end, AdjustedOffset::from(20)); + } + #[test] fn test_range_set_merges_adjacent_ranges() { let mut set = super::RangeSet::new(); diff --git a/src/rules/rule003_spelling.rs b/src/rules/rule003_spelling.rs index 0174264..c80eb60 100644 --- a/src/rules/rule003_spelling.rs +++ b/src/rules/rule003_spelling.rs @@ -22,6 +22,7 @@ use super::{ const DICTIONARY: &str = include_str!("./rule003_spelling/dictionary.txt"); +#[derive(Debug, Clone)] enum HyphenatedPart { MaybePrefix, MaybeSuffix, @@ -161,7 +162,7 @@ impl Rule003Spelling { continue; } - if word_as_string.contains('-') { + if word_as_string.contains('-') && !self.is_correct_spelling(&word_as_string, None) { // Deal with hyphenated words let mut hyphenated_tokenizer = WordIterator::new( word, @@ -235,38 +236,46 @@ impl Rule003Spelling { level: LintLevel, errors: &mut Option>, ) { - if word.len() < 2 { + if self.is_correct_spelling(word, hyphenation) { return; } + let error = LintError::new( + self.name(), + Rule003Spelling::message(word), + level, + location, + None, + context, + ); + errors.get_or_insert_with(Vec::new).push(error); + } + + fn 
is_correct_spelling(&self, word: &str, hyphenation: Option) -> bool { + trace!("Checking spelling of word: {word} with hyphenation: {hyphenation:?}"); + if word.len() < 2 { + return true; + } + if word .chars() - .any(|c| !c.is_ascii_alphabetic() && !is_punctuation(&c)) + .any(|c| !c.is_ascii_alphabetic() && !Self::is_included_punctuation(&c)) { - // Ignore words containing non-English alphabet and number - return; + return true; } let word = Self::normalize_word(word); if self.dictionary.contains(word.as_ref()) { - return; + return true; } if let Some(HyphenatedPart::MaybePrefix) = hyphenation { if self.prefixes.contains(word.as_ref()) { - return; + return true; } } - let error = LintError::new( - self.name(), - Rule003Spelling::message(&word), - level, - location, - None, - context, - ); - errors.get_or_insert_with(Vec::new).push(error); + false } fn normalize_word_range(word: RopeSlice<'_>, offset: usize) -> AdjustedRange { @@ -327,6 +336,21 @@ impl Rule003Spelling { word } } + + fn is_included_punctuation(c: &char) -> bool { + is_punctuation(c) + && (*c == '-' + || *c == '–' + || *c == '—' + || *c == '―' + || *c == '\'' + || *c == '‘' + || *c == '’' + || *c == '“' + || *c == '”' + || *c == '"' + || *c == '.') + } } #[cfg(test)] @@ -686,4 +710,31 @@ mod tests { ); assert!(errors.is_none()); } + + #[test] + fn test_rule003_bare_prefixes() { + let mdx = "pre- and post-world"; + let parse_result = parse(mdx).unwrap(); + let context = RuleContext::new(parse_result, None).unwrap(); + + let mut rule = Rule003Spelling::default(); + let settings = RuleSettings::with_array_of_strings("prefixes", vec!["pre", "post"]); + rule.setup(Some(&settings)); + + let errors = rule.check( + context + .ast() + .children() + .unwrap() + .get(0) + .unwrap() + .children() + .unwrap() + .get(0) + .unwrap(), + &context, + LintLevel::Error, + ); + assert!(errors.is_none()); + } } diff --git a/src/rules/rule003_spelling/dictionary.txt b/src/rules/rule003_spelling/dictionary.txt index 
821d5fc..e88198a 100644 --- a/src/rules/rule003_spelling/dictionary.txt +++ b/src/rules/rule003_spelling/dictionary.txt @@ -249,6 +249,10 @@ family 254164055 long 252519588 based 252405204 code 250245121 +hardcode 250245121 +hardcoded 250245121 +hardcodes 250245121 +hardcoding 250245121 show 247541986 even 245697701 black 244690155 @@ -258,6 +262,7 @@ special 244311841 prices 243435728 website 242876222 index 242826246 +reindex 242826246 being 242783091 women 242520455 much 242326300 @@ -517,6 +522,7 @@ equipment 144298238 login 144200144 signin 144200144 sign-in 144200144 +signout 144200144 signup 144200144 student 143590165 let 143062438 @@ -561,6 +567,7 @@ club 132428495 example 132369252 girls 132325396 additional 132187702 +passcode 132141721 password 132141721 passwordless 132141721 latest 131952173 @@ -598,6 +605,8 @@ al 125787543 easy 125548527 given 125542966 files 125524478 +filesystem 125524478 +filesystems 125524478 event 125515260 release 125340846 analysis 124949540 @@ -618,6 +627,7 @@ star 122498186 areas 121986327 future 121844371 space 121505269 +whitespace 121505269 committee 121345834 hand 121296661 sun 121218500 @@ -944,6 +954,10 @@ countries 83082522 loss 83015916 face 82807978 brand 82807581 +rebrand 82807581 +rebranded 82807581 +rebranding 82807581 +rebrands 82807581 discount 82646962 higher 82515596 effects 82399410 @@ -1140,6 +1154,7 @@ george 68261971 choice 68222335 went 68130142 starting 68080206 +deregistration 68025615 registration 68025615 fri 67934088 thursday 67751544 @@ -1591,6 +1606,8 @@ surface 50022259 minimum 50001652 visual 49969150 host 49918570 +hostname 49918570 +hostnames 49918570 variety 49844591 teachers 49832283 martin 49699093 @@ -1662,6 +1679,7 @@ funding 47661725 devices 47613452 lord 47612536 grant 47609624 +regrant 47609624 sub 47588521 agree 47500911 fiction 47449450 @@ -2054,6 +2072,7 @@ joe 37642746 guys 37603892 integrated 37600088 configuration 37589833 +misconfiguration 37589833 preconfiguration 37589833 cock 
37573028 merchant 37569902 @@ -2376,6 +2395,7 @@ depth 31879926 iowa 31875822 whatever 31869767 logged 31869263 +unlogged 31869263 laptop 31851245 vintage 31830831 train 31816971 @@ -2982,6 +3002,7 @@ hat 24383742 institution 24373421 directed 24347062 dealers 24342319 +searchable 24320613 searching 24320613 sporting 24320042 helping 24319316 @@ -3030,6 +3051,7 @@ cam 23995601 curriculum 23992488 logic 23989892 template 23980628 +templating 23980628 prince 23975036 circle 23962760 soil 23949346 @@ -3216,6 +3238,7 @@ securities 22397133 allen 22368068 relation 22355933 enabled 22345126 +reenabled 22345126 genre 22344091 slide 22339084 montana 22338359 @@ -3326,6 +3349,7 @@ stocks 21451319 buyers 21450595 journals 21443275 gray 21424658 +catalog 21424300 catalogue 21424300 jennifer 21402386 antonio 21388530 @@ -3405,6 +3429,7 @@ findings 20757470 muscle 20752802 featuring 20737262 implement 20723680 +reimplement 20723680 clicking 20719323 scheduled 20699522 polls 20694349 @@ -3535,6 +3560,8 @@ satisfaction 19684544 represents 19682589 char 19673918 indexed 19672520 +reindexed 19672520 +unindexed 19672520 pittsburgh 19654781 superior 19647999 preferred 19640247 @@ -3877,6 +3904,7 @@ taste 17311343 dealing 17303748 commit 17298630 tiny 17298541 +inoperational 17293992 operational 17293992 rail 17293553 airlines 17293401 @@ -4425,6 +4453,7 @@ nobody 14490265 ghost 14490042 violation 14467817 configure 14461355 +misconfigure 14461355 preconfigure 14461355 stability 14458903 applying 14440499 @@ -4528,6 +4557,7 @@ penalty 14018500 drum 14015510 glasses 14009815 enables 14005681 +reenables 14005681 iraqi 13999381 builder 13993483 vista 13991160 @@ -4797,6 +4827,7 @@ proven 12920984 schedules 12917359 admissions 12907686 cached 12887996 +uncached 12887996 warren 12887808 slip 12886015 studied 12881982 @@ -5004,6 +5035,7 @@ advertisers 12223306 moments 12221536 atlas 12221120 strings 12221039 +substrings 12221039 dawn 12219521 representing 12219202 observation 12217876 @@ 
-5094,6 +5126,7 @@ opposite 11965213 understood 11962620 rapidly 11959135 ban 11940116 +unban 11940116 temp 11938391 intro 11938092 mercedes 11935820 @@ -6103,8 +6136,10 @@ composed 9274507 tears 9274206 performances 9272429 performant 9272429 +performantly 9272429 oasis 9270498 baseline 9270123 +baselining 9270123 cab 9268943 angry 9267505 societies 9260047 @@ -6245,6 +6280,7 @@ univ 8991617 tub 8990811 rider 8987802 scheduling 8986610 +unscheduling 8986610 radius 8985161 perspectives 8983024 mortality 8978176 @@ -6623,6 +6659,7 @@ chess 8293823 operates 8291329 brisbane 8291276 configured 8289360 +misconfigured 8289360 preconfigured 8289360 survive 8286260 oscar 8285553 @@ -6662,6 +6699,7 @@ nickname 8226050 fiji 8219554 technician 8218658 inline 8218587 +inlined 8218587 executives 8215657 enquiries 8215393 washing 8213049 @@ -6694,6 +6732,7 @@ saskatchewan 8186212 cancellation 8183513 plugins 8183425 enrolled 8182105 +unenrolled 8182105 sensors 8181414 screw 8177263 ministers 8175323 @@ -7153,6 +7192,8 @@ priest 7401793 floyd 7401451 ronald 7400733 analysts 7400631 +dequeue 7398569 +enqueue 7398569 queue 7398569 trance 7392026 locale 7391821 @@ -7418,6 +7459,8 @@ actress 7010056 mess 7008727 conferencing 7008203 assign 7007787 +reassign 7007787 +unassign 7007787 armstrong 7006855 politicians 7005074 lit 7003064 @@ -7585,6 +7628,7 @@ satisfactory 6785226 revolutionary 6784780 bracelets 6784037 sync 6783321 +async 6783321 civilian 6782491 telephony 6781709 mesa 6780960 @@ -7987,6 +8031,7 @@ trainers 6312095 enhancements 6307063 renewable 6305969 intersection 6304801 +passcodes 6302572 passwords 6302572 sewing 6301885 consistency 6300897 @@ -8007,6 +8052,7 @@ tactics 6277023 trusts 6275089 occurring 6273414 supplemental 6271983 +traveling 6271787 travelling 6271787 talented 6268788 annie 6267924 @@ -8532,6 +8578,7 @@ learners 5675657 selective 5674677 arbitration 5674307 configuring 5673670 +misconfiguring 5673670 preconfiguring 5673670 token 5672353 editorials 
5671680 @@ -9182,6 +9229,7 @@ outlets 5061151 swaziland 5058010 varieties 5057493 configurations 5056310 +misconfigurations 5056310 preconfigurations 5056310 poison 5056083 ethnicity 5055334 @@ -9357,6 +9405,7 @@ suburbs 4899548 imagery 4898768 chromosome 4898576 optimized 4898373 +unoptimized 4898373 sears 4897480 flies 4894680 upgraded 4893959 @@ -9554,6 +9603,8 @@ puppies 4733987 relaxing 4733863 delphi 4732371 trophy 4731863 +emoji 4731715 +emojis 4731715 emotion 4731715 buick 4731433 slipknot 4731098 @@ -9707,6 +9758,7 @@ halls 4607236 alzheimer 4606962 decorations 4606704 pause 4606318 +unpause 4606318 simplicity 4606054 postscript 4604982 dividends 4604638 @@ -9964,6 +10016,7 @@ steak 4413680 commits 4413217 cobra 4412823 subset 4412351 +superset 4412351 gucci 4412128 threw 4410853 sutton 4410096 @@ -10733,6 +10786,7 @@ etiquette 3872794 rookie 3870767 environ 3870498 theatrical 3867947 +colored 3867584 coloured 3867584 births 3867031 cubs 3863171 @@ -10999,7 +11053,9 @@ meridian 3717338 marriages 3717000 regret 3716789 revalidate 3715972 +revalidator 3715972 validate 3715972 +validator 3715972 stakes 3715795 rotating 3715660 brigade 3711855 @@ -11017,6 +11073,7 @@ executing 3703273 greenwich 3699730 flooding 3699696 parse 3698919 +reparse 3698919 rugged 3698606 jelly 3698541 implementations 3697837 @@ -11405,6 +11462,7 @@ personalities 3493064 discography 3493008 stiff 3492842 encoded 3492618 +unencoded 3492618 researching 3490518 noah 3490320 wore 3490088 @@ -11493,6 +11551,7 @@ grandma 3450933 customization 3450194 gigs 3449514 indexing 3449511 +reindexing 3449511 lori 3449295 oceans 3449148 displacement 3448688 @@ -11586,6 +11645,7 @@ conditioned 3411409 prohibition 3410059 motions 3409091 redirect 3408086 +interop 3408052 interoperability 3408052 tuvalu 3404820 shampoo 3404715 @@ -12083,6 +12143,7 @@ riviera 3178598 apprentice 3177886 obscure 3177425 napoleon 3177380 +deregistrations 3177147 registrations 3177147 wavelength 3176899 glamour 3176022 @@ 
-12706,6 +12767,7 @@ milford 2898231 buckle 2898037 bartlett 2897791 fetch 2897593 +prefetch 2897593 kitchens 2897588 ions 2896976 wat 2896705 @@ -13051,6 +13113,7 @@ journeys 2751143 milestones 2750959 parkinson 2750688 parsing 2750227 +reparsing 2750227 splitting 2749527 mclean 2748981 derbyshire 2748810 @@ -14328,6 +14391,8 @@ rocker 2322314 acknowledges 2321868 alas 2321771 enrolment 2321704 +enrollment 2321704 +unenrollment 2321704 sawyer 2321394 maori 2321308 lawmakers 2321236 @@ -14427,6 +14492,7 @@ spurs 2295629 sion 2295561 crashed 2295284 appraisals 2294795 +traveled 2294125 travelled 2294125 urgency 2294018 flashes 2293952 @@ -14706,6 +14772,7 @@ rivera 2216114 dermatology 2215849 lied 2215843 sandbox 2215370 +sandboxed 2215370 bloc 2215168 cambridgeshire 2214799 premiership 2214457 @@ -15382,6 +15449,7 @@ sloan 2040295 pudding 2040237 flawed 2040094 checkpoint 2040010 +checkpointing 2040010 rosenberg 2039748 plato 2039668 examiners 2039353 @@ -16518,6 +16586,7 @@ multiplied 1790425 enchanted 1789839 belgrade 1789716 styled 1789390 +unstyled 1789390 commanders 1789190 thor 1789033 waive 1789027 @@ -17996,6 +18065,7 @@ upheld 1520200 manifestation 1520142 malt 1520084 subsets 1520074 +supersets 1520074 blazers 1519924 merritt 1519662 triad 1519533 @@ -19462,6 +19532,7 @@ regatta 1298113 rested 1297976 chatroom 1297966 paused 1297918 +unpaused 1297918 macbeth 1297856 polarity 1297690 overrides 1297543 @@ -20828,6 +20899,7 @@ hostages 1136029 swahili 1135933 rosario 1135784 enrolling 1135687 +unenrolling 1135687 fruitful 1135613 franks 1135609 commercialization 1135539 @@ -21793,6 +21865,7 @@ imprinted 1037643 pixie 1037554 proofing 1037507 clits 1037485 +keychain 1037390 keyring 1037390 bereavement 1037389 surrendered 1037373 @@ -22766,6 +22839,7 @@ relic 947341 teton 947032 newline 946974 slipper 946965 +deprioritize 946919 prioritize 946919 clashes 946852 augsburg 946750 @@ -23596,6 +23670,7 @@ taping 879253 somatic 879135 hepburn 879009 fetched 878933 
+prefetched 878933 alderman 878712 slump 878659 nerds 878641 @@ -23605,6 +23680,7 @@ coughing 878491 hiatus 878386 enrol 878274 enroll 878274 +unenroll 878274 upholstered 878180 evangelist 878138 louvre 878086 @@ -23735,6 +23811,7 @@ descendant 868120 disgust 868088 deterrent 868064 banked 867916 +deduplicating 867869 duplicating 867869 rationality 867752 screwing 867732 @@ -24445,6 +24522,7 @@ waning 815920 cartwright 815878 glycoprotein 815668 armoire 815525 +enqueued 815362 queued 815362 sab 815338 hydroxide 815265 @@ -24795,6 +24873,7 @@ zucchini 789907 mares 789891 enthusiastically 789840 fetching 789835 +prefetching 789835 chaps 789805 lanai 789800 tendon 789636 @@ -25396,6 +25475,7 @@ powering 753767 logins 753667 signins 753667 sign-ins 753667 +signouts 753667 signups 753667 sadism 753627 butchers 753573 @@ -27044,7 +27124,8 @@ disruptions 658563 erasure 658545 fishy 658490 preamp 658453 pauses 658415 +unpauses 658415 ziegler 658262 loewe 658236 intoxication 658221 @@ -28169,6 +28250,7 @@ ignatius 600857 alternates 600819 emperors 600721 configures 600721 +misconfigures 600721 preconfigures 600721 multilevel 600671 renoir 600542 @@ -28437,6 +28519,7 @@ exiles 588295 wheatley 588291 clapping 588220 finesse 588192 +fine-tune 588192 blitzkrieg 588026 nickels 587964 cordelia 587959 @@ -30402,6 +30485,7 @@ invariance 506198 darkly 506092 faithfulness 506045 resourceful 505988 +resourcefully 505988 pleural 505955 mediating 505915 heraldry 505866 @@ -31268,6 +31352,8 @@ callbacks 472475 arduous 472353 replicates 472334 sidewinder 472328 +dequeueing 472224 +enqueueing 472224 queueing 472224 slugger 472200 humidifiers 472176 @@ -33404,6 +33490,7 @@ syrups 403791 smirk 403780 estimations 403732 pausing 403730 +unpausing 403730 guesswork 403656 grands 403644 replete 403555 @@ -33426,6 +33513,8 @@ bristle 403221 terrors 403174 uriah 403108 oblige 403041 +timeframe 402983 +timeframes 402983 timepieces 402983 nonfarm 402968 anklet 402960 @@ -34377,6 
+34466,7 @@ parkin 375624 wagers 375520 ravioli 375470 enrolments 375459 +enrollments 375459 walling 375420 jointed 375370 ribosome 375345 @@ -34547,6 +34637,7 @@ breaching 371021 maelstrom 371002 rivalries 371000 gnomes 370965 +deprioritizing 370912 prioritizing 370912 affectionately 370906 uneducated 370877 @@ -39415,6 +39506,7 @@ chairing 264278 birdhouses 264278 screech 264240 fetches 264162 +prefetches 264162 tradable 264150 jami 264138 axillary 264136 @@ -41162,6 +41254,7 @@ centrifuges 234736 colonnade 234702 anhydride 234668 overburden 234664 +indexable 234647 indexation 234647 abides 234643 architecturally 234640 @@ -45264,6 +45357,7 @@ wilfully 178531 burro 178515 tricycles 178514 paralysed 178491 +paralyzed 178491 organelle 178479 ramakrishna 178467 distanced 178454 @@ -51853,6 +51947,7 @@ motorcars 115390 stragglers 115386 scowl 115386 tinder 115380 +deprioritizes 115380 prioritizes 115380 neuroma 115379 backfield 115379 @@ -63821,6 +63916,8 @@ introspect 52710 ionians 52705 tyros 52703 hepcat 52700 +geospace 52697 +geospatial 52697 geosphere 52697 deoxyribonuclease 52696 stela 52675 @@ -67118,6 +67215,7 @@ shrinker 42313 sharkskin 42310 fudged 42308 processable 42300 +unprocessable 42300 terrorised 42298 understandability 42297 loverly 42297 @@ -70438,6 +70536,7 @@ canoeist 33670 ministrative 33669 autograft 33667 disfigure 33659 +disform 33659 shoemaking 33657 rostand 33653 soporific 33650 @@ -71705,6 +71804,7 @@ sashay 30994 mounding 30991 crouton 30987 paralysing 30978 +paralyzing 30978 adjudicates 30978 rookeries 30975 blackcap 30973 @@ -73373,6 +73473,7 @@ cockamamie 27556 staved 27555 silversides 27555 paralyse 27553 +paralyze 27553 supernaturals 27550 sigmas 27550 harrar 27549 @@ -77806,6 +77907,7 @@ deodorize 19667 browband 19667 precipitately 19665 paralyses 19664 +paralyzes 19664 buzzkill 19663 unceremonious 19661 sugarcoat 19653 @@ -77861,6 +77963,8 @@ foregut 19574 libidinal 19572 invt 19572 enrols 19572 +enrolls 19572 +unenrolls 19572 
cyclopropane 19571 loges 19568 frolicked 19568 diff --git a/src/utils/words.rs b/src/utils/words.rs index 2594355..d01f6a4 100644 --- a/src/utils/words.rs +++ b/src/utils/words.rs @@ -284,7 +284,15 @@ impl WordParser { ParserNext::Continue } ParseState::PunctuationTrailing(punctuation) => { - let word_end_offset = self.tracking_offset.saturating_sub(punctuation.len()); + // If the word ends with a hyphen, preserve the hyphen so we + // can capture bare prefixes like `pre-` and `post-` + let preserve_punctuation = punctuation == "-"; + + let word_end_offset = if preserve_punctuation { + self.tracking_offset + } else { + self.tracking_offset.saturating_sub(punctuation.len()) + }; let curr_capitalize = self.capitalize; if let Some(p) = punctuation.chars().last() { @@ -429,6 +437,31 @@ impl WordParser { } } +pub fn is_punctuation(c: &char) -> bool { + *c == '!' + || *c == '-' + || *c == '–' + || *c == '—' + || *c == '―' + || *c == '(' + || *c == ')' + || *c == '[' + || *c == ']' + || *c == '{' + || *c == '}' + || *c == ':' + || *c == '\'' + || *c == '‘' + || *c == '’' + || *c == '“' + || *c == '”' + || *c == '"' + || *c == '?' + || *c == ',' + || *c == '.' + || *c == ';' +} + #[cfg(test)] mod tests { use super::*; @@ -529,6 +562,21 @@ mod tests { assert!(iter.next().is_none()); } + #[test] + fn test_word_iterator_include_hyphen_on_bare_prefixes() { + let rope = Rope::from("pre- and post-world"); + let slice = rope.byte_slice(..); + let mut iter = WordIterator::new(slice, 0, Default::default()); + + let (offset, word, _cap) = iter.next().unwrap(); + assert_eq!(offset, 0); + assert_eq!(word.to_string(), "pre-"); + + let (offset, word, _cap) = iter.nth(1).unwrap(); + assert_eq!(offset, 9); + assert_eq!(word.to_string(), "post-world"); + } + #[test] fn test_word_iterator_with_emoji() { let rope = Rope::from("hello 🤝 world"); @@ -763,28 +811,3 @@ mod tests { } } } - -pub fn is_punctuation(c: &char) -> bool { - *c == '!' 
- || *c == '-' - || *c == '–' - || *c == '—' - || *c == '―' - || *c == '(' - || *c == ')' - || *c == '[' - || *c == ']' - || *c == '{' - || *c == '}' - || *c == ':' - || *c == '\'' - || *c == '‘' - || *c == '’' - || *c == '“' - || *c == '”' - || *c == '"' - || *c == '?' - || *c == ',' - || *c == '.' - || *c == ';' -}