diff --git a/.gitignore b/.gitignore index 3e8190d..9093343 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,8 @@ nosetests.xml .pydevproject # PyCharm projects -*idea/* \ No newline at end of file +*idea/* + +.python-version +poetry.lock +.pytest_cache/ \ No newline at end of file diff --git a/README.md b/README.md index f275322..28270b7 100644 --- a/README.md +++ b/README.md @@ -14,15 +14,12 @@ Generate a random CSV file with a schema. 3. str = random string 4. ip = random (possibly legal) ip address (IPv4) 5. date = random date and time stamp -6. word = random namealizer word -7. pipewords = one to three pipedelimited random namealizer words +6. word = random word +7. pipewords = one to three pipedelimited random words 8. level = simulated log entries 9. sentence = random sentence with a maximum number of words 10. url = random url -Namealizer words generated using the project at -https://github.com/LeonardMH/namealizer - ## Usage Get help with `-h` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7bbb616 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[tool.poetry] +name = "random-csv" +version = "1.0.0" +description = "Generate a random csv file" +authors = ["Lucie Masson <lucie.masson@lengow.com>"] +readme = "README.md" +license = "MIT" +repository = "https://github.com/lengow/random-csv" +packages = [{include = "random_csv"}] + +[tool.poetry.dependencies] +python = ">=3.10,<4.0" +Faker = "*" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[dependency-groups] +test = [ + "pytest (>=9.0.1,<10.0.0)" +] diff --git a/random_csv/generate_csv.py b/random_csv/generate_csv.py index 8d46028..29c7654 100644 --- a/random_csv/generate_csv.py +++ b/random_csv/generate_csv.py @@ -8,7 +8,7 @@ from enum import Enum from functools import partial -import namealizer +from faker import Faker class Level(Enum): @@ -19,16 +19,6 @@ class Level(Enum): INFO = 5 -class CardinalNS(Enum): - N = 1 - S = 2 - - -class 
CardinalEW(Enum): - W = 1 - E = 2 - - def csv_generator(rows, schema, sentence_max_size, desc_max_size, categories_size, header, seed): """ generator of random csv lines @@ -44,8 +34,11 @@ def csv_generator(rows, schema, sentence_max_size, desc_max_size, categories_siz """ # initializations of generators and charset - wg = namealizer.WordGenerator(seed=seed) - wg.dictionary = OrderedDict(sorted(wg.dictionary.items(), key=lambda x:x[1], reverse=True)) + fake = Faker() + if seed is not None: + Faker.seed(seed) + random.seed(seed) + generators = [] char_set = (string.ascii_letters + string.digits + ' ') categories = [] @@ -62,7 +55,7 @@ def choose_category_element(i): if column == 'int': intcount += 1 head.append('number_' + str(idcount)) - generators.append(lambda: random.randint(0, 1e9)) + generators.append(lambda: random.randint(0, 1000000000)) if column == 'id': idcount += 1 head.append('id_' + str(idcount)) @@ -71,102 +64,55 @@ def choose_category_element(i): elif column == 'str': strcount += 1 head.append('text_' + str(strcount)) - generators.append(lambda: ''.join( - random.choice(char_set) for _ in range(12))) + generators.append(lambda: fake.pystr(min_chars=12, max_chars=12)) elif column == 'float': floatcount += 1 head.append('float_' + str(floatcount)) - generators.append(lambda: random.randint(0, 1e4)+random.random()) + generators.append(lambda: random.randint(0, 10000)+random.random()) elif column == 'ip': ipcount += 1 head.append('ip_' + str(ipcount)) - # http://stackoverflow.com/a/21014713 Thanks jonrsharpe - generators.append(lambda: ''.join( - ".".join(map(str, (random.randint(0, 255) - for _ in range(4)))) - )) + generators.append(lambda: fake.ipv4()) elif column == 'date': datecount += 1 head.append('date_' + str(datecount)) - generators.append(lambda: ''.join( - datetime.fromtimestamp(random.randint(0, 1e10)).strftime("%d/%m/%y_%H:%M") - )) + generators.append(lambda: fake.date_time().strftime("%d/%m/%y_%H:%M")) elif column == 'word': wordcount += 
1 head.append('label_' + str(wordcount)) - generators.append(lambda: ''.join( - generateword(wg) - )) + generators.append(lambda: fake.word()) elif column == 'category': categorycount += 1 - elements = wg[categories_size] - categories.append(elements.split()) + elements = fake.words(nb=categories_size) + categories.append(elements) head.append('category_' + str(categorycount)) generators.append(partial(choose_category_element, categorycount-1)) elif column == 'pipewords': pipewordscount += 1 head.append('pipe_' + str(pipewordscount)) - generators.append(lambda: ''.join( - generatepipewords(wg) - )) + generators.append(lambda: "|".join(fake.words(nb=3))) elif column == 'sentence': sentencecount += 1 head.append('sentence_' + str(sentencecount)) - generators.append(lambda: ''.join( - generatesentence(sentence_max_size, wg) - )) + generators.append(lambda: fake.sentence(nb_words=sentence_max_size)) elif column == 'description': desccount += 1 head.append('description_' + str(desccount)) - generators.append(lambda: ''.join( - generatesentence(desc_max_size, wg) - )) + generators.append(lambda: fake.text(max_nb_chars=desc_max_size).replace('\n', ' ')) elif column == 'url': urlcount += 1 head.append('url_' + str(urlcount)) - generators.append(lambda: ''.join( - generateurl(wg) - )) + generators.append(lambda: fake.url()) elif column == 'level': levelcount += 1; head.append('level' + str(levelcount)) - generators.append(lambda: ''.join( - Level(random.randint(1, 5)).name - )) + generators.append(lambda: Level(random.randint(1, 5)).name) elif column == 'lat': head.append('latitude') - generators.append(lambda: ''.join( - "".join(map(str, - (random.randint(0, 89), - "°", - " ", - random.randint(0, 59), - "′", - " ", - random.randint(0, 59), - ".", - random.randint(1, 99), - "″", - " ", - CardinalNS(random.randint(1, 2)).name))), - )) + generators.append(lambda: str(fake.latitude())) elif column == 'long': head.append('longitude') - generators.append(lambda: ''.join( - 
"".join(map(str, - (random.randint(0, 179), - "°", - " ", - random.randint(0, 59), - "′", - " ", - random.randint(0, 59), - ".", - random.randint(1, 99), - "″", - " ", - CardinalEW(random.randint(1, 2)).name))), - )) + generators.append(lambda: str(fake.longitude())) # return the header at first call if specified if header: @@ -179,32 +125,6 @@ def choose_category_element(i): n += 1 -def generateword(wg): - return wg[1] - - -def generatesentence(max_nb_words, wg): - sentence_size = random.randint(1, max_nb_words) - words = wg[sentence_size] - return words - - -def generatepipewords(wg): - words = generatesentence(3, wg) - retval = words.replace(' ', '|') - return retval - - -def generateurl(wg): - domain_gen = generatesentence(2, wg) - domain = domain_gen.replace(' ', '.') + '.' + generateword(wg)[:3] - path_gen = generatesentence(3, wg) - path = path_gen.replace(' ', '/') - file = generateword(wg) + '.' + generateword(wg)[:3] - retval = "/".join([random.choice(['http:/', 'https:/']), domain, path, file]) - return retval - - def generateid(i, generated_ids, max_size): id = random.randint(1, max_size) nb_call = 0 diff --git a/setup.py b/setup.py deleted file mode 100644 index 3e14a1c..0000000 --- a/setup.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from distutils.core import setup - -setup( - name='random_csv', - version='1.0.0', - author='Lucie Masson', - author_email='lucie.masson@lengow.com', - packages=['random_csv'], - url='https://github.com/lengow/random-csv', - license='LICENSE.txt', - description='Generate a random csv file', - long_description=open('README.md').read(), - install_requires=['namealizer'], -) \ No newline at end of file diff --git a/tests/test_generate_csv.py b/tests/test_generate_csv.py new file mode 100644 index 0000000..2afdf62 --- /dev/null +++ b/tests/test_generate_csv.py @@ -0,0 +1,79 @@ +import pytest +from random_csv.generate_csv import csv_generator + +def test_csv_generator_basic(): + rows = 
10 + schema = ['int', 'str', 'word'] + gen = csv_generator(rows, schema, 10, 100, 10, False, None) + + data = list(gen) + assert len(data) == rows + for row in data: + assert len(row) == 3 + assert isinstance(row[0], int) + assert isinstance(row[1], str) + assert isinstance(row[2], str) + +def test_csv_generator_header(): + rows = 5 + schema = ['int', 'str'] + gen = csv_generator(rows, schema, 10, 100, 10, True, None) + + data = list(gen) + assert len(data) == rows + 1 # Header + rows + header = data[0] + assert header == ['number_0', 'text_1'] + +def test_csv_generator_seed_determinism(): + rows = 5 + schema = ['int', 'str', 'ip'] + seed = 42 + + gen1 = csv_generator(rows, schema, 10, 100, 10, False, seed) + data1 = list(gen1) + + gen2 = csv_generator(rows, schema, 10, 100, 10, False, seed) + data2 = list(gen2) + + assert data1 == data2 + +def test_csv_generator_all_types(): + rows = 2 + schema = [ + 'int', 'str', 'float', 'ip', 'date', 'word', + 'pipewords', 'level', 'lat', 'long', 'sentence', + 'url', 'id', 'category', 'description' + ] + gen = csv_generator(rows, schema, 10, 100, 10, False, None) + data = list(gen) + + assert len(data) == rows + for row in data: + assert len(row) == len(schema) + # Basic type checks + assert isinstance(row[0], int) # int + assert isinstance(row[1], str) # str + assert isinstance(row[2], float) # float + assert isinstance(row[3], str) # ip + assert len(row[3].split('.')) == 4 + assert isinstance(row[4], str) # date + assert isinstance(row[5], str) # word + assert '|' in row[6] # pipewords + assert row[7] in ['CRITICAL', 'SEVERE', 'MODERATE', 'MILD', 'INFO'] # level + assert isinstance(row[8], str) # lat + assert isinstance(row[9], str) # long + assert isinstance(row[10], str) # sentence + assert isinstance(row[11], str) # url + assert row[11].startswith('http') + assert isinstance(row[12], int) # id + assert isinstance(row[13], str) # category (returns one word from the list) + assert isinstance(row[14], str) # description + 
+def test_csv_generator_id_uniqueness(): + rows = 100 + schema = ['id'] + gen = csv_generator(rows, schema, 10, 100, 10, False, None) + data = list(gen) + + ids = [row[0] for row in data] + assert len(ids) == len(set(ids))