Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,8 @@ nosetests.xml
.pydevproject

# PyCharm projects
*idea/*
*idea/*

.python-version
poetry.lock
.pytest_cache/
7 changes: 2 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,12 @@ Generate a random CSV file with a schema.
3. str = random string
4. ip = random (possibly legal) ip address (IPv4)
5. date = random date and time stamp
6. word = random namealizer word
7. pipewords = one to three pipe-delimited random namealizer words
6. word = random word
7. pipewords = one to three pipe-delimited random words
8. level = simulated log entries
9. sentence = random sentence with a maximum number of words
10. url = random url

Namealizer words generated using the project at
https://github.com/LeonardMH/namealizer

## Usage

Get help with `-h`
Expand Down
22 changes: 22 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Package metadata read by Poetry.
[tool.poetry]
name = "random-csv"
version = "1.0.0"
description = "Generate a random csv file"
authors = ["Lucie Masson <lucie.masson@lengow.com>"]
readme = "README.md"
license = "MIT"
repository = "https://github.com/lengow/random-csv"
# The distribution ships the random_csv package directory.
packages = [{include = "random_csv"}]

# Runtime requirements.
[tool.poetry.dependencies]
python = ">=3.10,<4.0"
# Faker is deliberately left unconstrained ("*" accepts any version).
Faker = "*"

# Build backend: the project is built with poetry-core.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# Dependency group holding the tooling needed to run the test suite.
[dependency-groups]
test = [
"pytest (>=9.0.1,<10.0.0)"
]
122 changes: 21 additions & 101 deletions random_csv/generate_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from enum import Enum
from functools import partial

import namealizer
from faker import Faker


class Level(Enum):
Expand All @@ -19,16 +19,6 @@ class Level(Enum):
INFO = 5


class CardinalNS(Enum):
    """North/south hemisphere letter; looked up by value (1 or 2) and used via ``.name``."""

    # Values are 1-based so a member can be picked with random.randint(1, 2).
    N, S = 1, 2


class CardinalEW(Enum):
    """West/east hemisphere letter; looked up by value (1 or 2) and used via ``.name``."""

    # Values are 1-based so a member can be picked with random.randint(1, 2).
    W, E = 1, 2


def csv_generator(rows, schema, sentence_max_size, desc_max_size, categories_size, header, seed):
"""
generator of random csv lines
Expand All @@ -44,8 +34,11 @@ def csv_generator(rows, schema, sentence_max_size, desc_max_size, categories_siz
"""

# initializations of generators and charset
wg = namealizer.WordGenerator(seed=seed)
wg.dictionary = OrderedDict(sorted(wg.dictionary.items(), key=lambda x:x[1], reverse=True))
fake = Faker()
if seed is not None:
Faker.seed(seed)
random.seed(seed)

generators = []
char_set = (string.ascii_letters + string.digits + ' ')
categories = []
Expand All @@ -62,7 +55,7 @@ def choose_category_element(i):
if column == 'int':
intcount += 1
head.append('number_' + str(idcount))
generators.append(lambda: random.randint(0, 1e9))
generators.append(lambda: random.randint(0, 1000000000))
if column == 'id':
idcount += 1
head.append('id_' + str(idcount))
Expand All @@ -71,102 +64,55 @@ def choose_category_element(i):
elif column == 'str':
strcount += 1
head.append('text_' + str(strcount))
generators.append(lambda: ''.join(
random.choice(char_set) for _ in range(12)))
generators.append(lambda: fake.pystr(min_chars=12, max_chars=12))
elif column == 'float':
floatcount += 1
head.append('float_' + str(floatcount))
generators.append(lambda: random.randint(0, 1e4)+random.random())
generators.append(lambda: random.randint(0, 10000)+random.random())
elif column == 'ip':
ipcount += 1
head.append('ip_' + str(ipcount))
# http://stackoverflow.com/a/21014713 Thanks jonrsharpe
generators.append(lambda: ''.join(
".".join(map(str, (random.randint(0, 255)
for _ in range(4))))
))
generators.append(lambda: fake.ipv4())
elif column == 'date':
datecount += 1
head.append('date_' + str(datecount))
generators.append(lambda: ''.join(
datetime.fromtimestamp(random.randint(0, 1e10)).strftime("%d/%m/%y_%H:%M")
))
generators.append(lambda: fake.date_time().strftime("%d/%m/%y_%H:%M"))
elif column == 'word':
wordcount += 1
head.append('label_' + str(wordcount))
generators.append(lambda: ''.join(
generateword(wg)
))
generators.append(lambda: fake.word())
elif column == 'category':
categorycount += 1
elements = wg[categories_size]
categories.append(elements.split())
elements = fake.words(nb=categories_size)
categories.append(elements)
head.append('category_' + str(categorycount))
generators.append(partial(choose_category_element, categorycount-1))
elif column == 'pipewords':
pipewordscount += 1
head.append('pipe_' + str(pipewordscount))
generators.append(lambda: ''.join(
generatepipewords(wg)
))
generators.append(lambda: "|".join(fake.words(nb=3)))
elif column == 'sentence':
sentencecount += 1
head.append('sentence_' + str(sentencecount))
generators.append(lambda: ''.join(
generatesentence(sentence_max_size, wg)
))
generators.append(lambda: fake.sentence(nb_words=sentence_max_size))
elif column == 'description':
desccount += 1
head.append('description_' + str(desccount))
generators.append(lambda: ''.join(
generatesentence(desc_max_size, wg)
))
generators.append(lambda: fake.text(max_nb_chars=desc_max_size).replace('\n', ' '))
elif column == 'url':
urlcount += 1
head.append('url_' + str(urlcount))
generators.append(lambda: ''.join(
generateurl(wg)
))
generators.append(lambda: fake.url())
elif column == 'level':
levelcount += 1;
head.append('level' + str(levelcount))
generators.append(lambda: ''.join(
Level(random.randint(1, 5)).name
))
generators.append(lambda: Level(random.randint(1, 5)).name)
elif column == 'lat':
head.append('latitude')
generators.append(lambda: ''.join(
"".join(map(str,
(random.randint(0, 89),
"°",
" ",
random.randint(0, 59),
"′",
" ",
random.randint(0, 59),
".",
random.randint(1, 99),
"″",
" ",
CardinalNS(random.randint(1, 2)).name))),
))
generators.append(lambda: str(fake.latitude()))
elif column == 'long':
head.append('longitude')
generators.append(lambda: ''.join(
"".join(map(str,
(random.randint(0, 179),
"°",
" ",
random.randint(0, 59),
"′",
" ",
random.randint(0, 59),
".",
random.randint(1, 99),
"″",
" ",
CardinalEW(random.randint(1, 2)).name))),
))
generators.append(lambda: str(fake.longitude()))

# return the header at first call if specified
if header:
Expand All @@ -179,32 +125,6 @@ def choose_category_element(i):
n += 1


def generateword(wg):
    """
    return a single word from the word generator

    :param wg: object indexable by a word count (presumably a
        namealizer.WordGenerator -- confirm against the caller)
    :return: whatever ``wg`` yields when asked for exactly one word
    """
    single_word_count = 1
    return wg[single_word_count]


def generatesentence(max_nb_words, wg):
    """
    return a group of words of random length

    The length is drawn uniformly from 1 to ``max_nb_words`` (both ends
    included) and handed to the word generator as an index.

    :param max_nb_words: largest number of words that may be requested
    :param wg: object indexable by a word count (presumably returns the
        words as one space-separated string -- confirm against the caller)
    :return: the value ``wg`` yields for the drawn word count
    """
    return wg[random.randint(1, max_nb_words)]


def generatepipewords(wg):
    """
    return one to three random words joined by a pipe character

    :param wg: word generator forwarded to generatesentence
    :return: the generated words with every space replaced by '|'
    """
    # at most three words are requested; spaces become the pipe delimiter
    return generatesentence(3, wg).replace(' ', '|')


def generateurl(wg):
    """
    return a random url assembled from generated words

    Layout: ``scheme://<1-2 dotted words>.<3-letter suffix>/<1-3 path
    segments>/<word>.<3-letter extension>``.

    :param wg: word generator forwarded to generatesentence / generateword
    :return: the url as a string
    """
    # host: up to two words joined by dots, plus the first three letters of a word
    host = generatesentence(2, wg).replace(' ', '.') + '.' + generateword(wg)[:3]
    # path: up to three words, one per path segment
    route = generatesentence(3, wg).replace(' ', '/')
    # trailing file name: a word and a three-letter extension
    resource = generateword(wg) + '.' + generateword(wg)[:3]
    # the scheme carries a single slash; joining with "/" supplies the second one
    scheme = random.choice(['http:/', 'https:/'])
    return "/".join([scheme, host, route, resource])


def generateid(i, generated_ids, max_size):
id = random.randint(1, max_size)
nb_call = 0
Expand Down
17 changes: 0 additions & 17 deletions setup.py

This file was deleted.

79 changes: 79 additions & 0 deletions tests/test_generate_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import pytest
from random_csv.generate_csv import csv_generator

def test_csv_generator_basic():
    """Without a header, the generator yields exactly `rows` typed records."""
    expected_rows = 10
    expected_types = (int, str, str)  # one entry per schema column, in order
    produced = list(
        csv_generator(expected_rows, ['int', 'str', 'word'], 10, 100, 10, False, None)
    )

    assert len(produced) == expected_rows
    for record in produced:
        assert len(record) == 3
        for index, expected_type in enumerate(expected_types):
            assert isinstance(record[index], expected_type)

def test_csv_generator_header():
    """With the header flag set, one extra leading line carries the column names."""
    row_count = 5
    lines = list(csv_generator(row_count, ['int', 'str'], 10, 100, 10, True, None))

    # the header line comes on top of the requested data rows
    assert len(lines) == row_count + 1

    # NOTE(review): the int column is expected as 'number_0' while the str
    # column is 'text_1' -- confirm this numbering is intended in the generator.
    first_line = lines[0]
    assert first_line == ['number_0', 'text_1']

def test_csv_generator_seed_determinism():
    """Two runs with the same seed must produce identical, non-empty output."""
    rows = 5
    schema = ['int', 'str', 'ip']
    seed = 42

    # Each generator is fully consumed before the next one is created, so the
    # second run starts from a freshly applied seed.
    first_run = list(csv_generator(rows, schema, 10, 100, 10, False, seed))
    second_run = list(csv_generator(rows, schema, 10, 100, 10, False, seed))

    # Guard against a vacuous pass: two empty outputs would also compare equal.
    assert len(first_run) == rows
    assert len(second_run) == rows

    assert first_run == second_run

def test_csv_generator_all_types():
    """Every supported column type yields a value of the expected shape."""
    schema = [
        'int', 'str', 'float', 'ip', 'date', 'word',
        'pipewords', 'level', 'lat', 'long', 'sentence',
        'url', 'id', 'category', 'description'
    ]
    # column name -> index in the generated row
    position = {column: index for index, column in enumerate(schema)}
    level_names = ['CRITICAL', 'SEVERE', 'MODERATE', 'MILD', 'INFO']
    expected_rows = 2

    produced = list(csv_generator(expected_rows, schema, 10, 100, 10, False, None))

    assert len(produced) == expected_rows
    for record in produced:
        assert len(record) == len(schema)

        # basic type / shape checks, column by column
        assert isinstance(record[position['int']], int)
        assert isinstance(record[position['str']], str)
        assert isinstance(record[position['float']], float)

        ip_value = record[position['ip']]
        assert isinstance(ip_value, str)
        assert len(ip_value.split('.')) == 4  # dotted quad

        assert isinstance(record[position['date']], str)
        assert isinstance(record[position['word']], str)
        assert '|' in record[position['pipewords']]
        assert record[position['level']] in level_names
        assert isinstance(record[position['lat']], str)
        assert isinstance(record[position['long']], str)
        assert isinstance(record[position['sentence']], str)

        url_value = record[position['url']]
        assert isinstance(url_value, str)
        assert url_value.startswith('http')

        assert isinstance(record[position['id']], int)
        # a category cell holds a single word picked from the category list
        assert isinstance(record[position['category']], str)
        assert isinstance(record[position['description']], str)

def test_csv_generator_id_uniqueness():
    """No id value may appear twice within one generated file."""
    row_count = 100
    produced = list(csv_generator(row_count, ['id'], 10, 100, 10, False, None))

    # the schema has a single column, so the id is the first cell of each row
    seen_ids = [record[0] for record in produced]
    assert len(seen_ids) == len(set(seen_ids))