From a505e602d3ff01019489932a40b98764a16b8ef0 Mon Sep 17 00:00:00 2001 From: Richard Ngo-Lam Date: Wed, 2 Oct 2024 00:25:50 +0200 Subject: [PATCH] add tweak --- crawler.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/crawler.py b/crawler.py index cf67cee..b7f9d5a 100644 --- a/crawler.py +++ b/crawler.py @@ -1,25 +1,23 @@ import re import string import numpy as np +from collections import defaultdict alphabet = string.ascii_uppercase + ' ' def cbpd_from_book(book): text = sanitize(open(book, 'r').read()) - g = {} - for c1 in alphabet: - g[c1] = {} - for c2 in alphabet: - g[c1][c2] = 0 - + + g = defaultdict(lambda: defaultdict(int)) + for i in range(0, len(text) - 1): g[text[i]][text[i + 1]] += 1 - for c1 in g.keys(): + for c1 in g: total = sum(g[c1].values()) - for c2 in g[c1].keys(): - g[c1][c2] = g[c1][c2] / total + for c2 in g[c1]: + g[c1][c2] /= total return g