Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
394 changes: 394 additions & 0 deletions huggingface_pipelines/hf_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Keep the demo small: take only rows 0-9 of the train split of the \"stories\" configuration.\n",
"ds = load_dataset(\n",
"    \"HuggingFaceTB/cosmopedia\",\n",
"    \"stories\",\n",
"    split=\"train[0:10]\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/home/artyomko/miniconda3/envs/sonar/lib/python3.10/site-packages/torch/cuda/__init__.py:628: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n",
"/data/home/artyomko/miniconda3/envs/sonar/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# Pipeline classes and their config objects from this repository's huggingface_pipelines package.\n",
"# The statement starts at column 0 so the cell is also valid as plain Python\n",
"# (e.g. when the notebook is exported to a script), not only inside IPython.\n",
"from huggingface_pipelines.text import (\n",
"    EmbeddingToTextPipelineConfig,\n",
"    HFEmbeddingToTextPipeline,\n",
"    HFTextToEmbeddingPipeline,\n",
"    TextSegmentationPipeline,\n",
"    TextSegmentationPipelineConfig,\n",
"    TextToEmbeddingPipelineConfig,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:huggingface_pipelines.text:Text preprocessing model initialized.\n"
]
}
],
"source": [
"# Setup hint from the original cell (run once in a shell if the spaCy model is missing):\n",
"#   python -m spacy download en_core_web_sm\n",
"\n",
"# Segment the \"text\" column into sentences; the new column is named \"text_sentences\".\n",
"text_to_segment_config = TextSegmentationPipelineConfig(\n",
"    columns=[\"text\"],\n",
"    output_column_suffix=\"sentences\",\n",
"    source_lang=\"eng_Latn\",\n",
"    handle_missing=\"fill\",\n",
"    fill_value=\"N/A\",\n",
"    output_path=\"./output\",\n",
")\n",
"pipeline_text2sent = TextSegmentationPipeline(text_to_segment_config)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Settings for the sentence -> embedding step.\n",
"text_to_embedding_config = TextToEmbeddingPipelineConfig(\n",
"    # Encoder model and how it is run\n",
"    encoder_model=\"text_sonar_basic_encoder\",\n",
"    device=\"cpu\",\n",
"    batch_size=2,\n",
"    source_lang=\"eng_Latn\",\n",
"    # Input: the sentence lists added by the segmentation step\n",
"    columns=[\"text_sentences\"],\n",
"    # Output: the new column is named \"text_sentences_embedding\"\n",
"    output_column_suffix=\"embedding\",\n",
"    output_path=\"test\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:huggingface_pipelines.text:Initializing text to embedding model...\n"
]
}
],
"source": [
"# Build the text -> embedding pipeline from text_to_embedding_config.\n",
"# The recorded log shows the embedding model being initialized at this point.\n",
"pipeline_sent2emb = HFTextToEmbeddingPipeline(text_to_embedding_config)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:huggingface_pipelines.pipeline:Starting to process dataset...\n",
"Processing dataset: 100%|██████████| 10/10 [02:40<00:00, 16.04s/ examples]\n"
]
}
],
"source": [
"# Step 1 segments the text into sentences, step 2 embeds the segmented sentences.\n",
"# NOTE: `ds` is rebound at each step, so re-running this cell passes the already\n",
"# processed dataset through both pipelines again; re-run the loading cell first.\n",
"# The recorded progress bar shows about 2m40s for 10 examples, so expect this to be slow.\n",
"ds = pipeline_text2sent(ds)\n",
"ds = pipeline_sent2emb(ds)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['text',\n",
" 'prompt',\n",
" 'text_token_length',\n",
" 'seed_data',\n",
" 'format',\n",
" 'audience',\n",
" 'text_sentences',\n",
" 'text_sentences_embedding']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the columns after both pipelines ran; the recorded output shows the added\n",
"# \"text_sentences\" and \"text_sentences_embedding\" columns next to the original ones.\n",
"ds.column_names"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>prompt</th>\n",
" <th>text_token_length</th>\n",
" <th>seed_data</th>\n",
" <th>format</th>\n",
" <th>audience</th>\n",
" <th>text_sentences</th>\n",
" <th>text_sentences_embedding</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Once upon a time, in a village called Kiwilan...</td>\n",
" <td>Write an educational story (3-5 paragraphs) ta...</td>\n",
" <td>520</td>\n",
" <td>ultrachat</td>\n",
" <td>story_children</td>\n",
" <td>young_children</td>\n",
" <td>[Once upon a time, in a village called Kiwilan...</td>\n",
" <td>[[0.009837754, 0.003806148, -0.008218781, 0.00...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>In a bustling town full of curious creatures ...</td>\n",
" <td>Write an educational story (3-5 paragraphs) ta...</td>\n",
" <td>381</td>\n",
" <td>openhermes2.5</td>\n",
" <td>story_children</td>\n",
" <td>young_children</td>\n",
" <td>[In a bustling town full of curious creatures ...</td>\n",
" <td>[[0.0026006007, -0.008539987, 0.009180809, 0.0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Step 3: Embracing an Unconventional Warmup Ro...</td>\n",
" <td>Write a real-life story shared by someone in a...</td>\n",
" <td>580</td>\n",
" <td>openhermes2.5</td>\n",
" <td>story_reddit</td>\n",
" <td>general</td>\n",
" <td>[Step 3: Embracing an Unconventional Warmup Ro...</td>\n",
" <td>[[-0.0052720755, 0.0026521664, 0.018520469, 0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Once upon a time, in a small town named Harmo...</td>\n",
" <td>Write an educational story (3-5 paragraphs) ta...</td>\n",
" <td>439</td>\n",
" <td>ultrachat</td>\n",
" <td>story_children</td>\n",
" <td>young_children</td>\n",
" <td>[Once upon a time, in a small town named Harmo...</td>\n",
" <td>[[-0.0006834645, 0.0034085037, -0.011185894, 0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>On a bright, sunny day, two best friends, Tim...</td>\n",
" <td>Write an educational story (3-5 paragraphs) ta...</td>\n",
" <td>414</td>\n",
" <td>openhermes2.5</td>\n",
" <td>story_children</td>\n",
" <td>young_children</td>\n",
" <td>[On a bright, sunny day, two best friends, Tim...</td>\n",
" <td>[[-0.013423425, 0.007448226, -0.010567971, 0.0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>In the bustling city of New York, there was a...</td>\n",
" <td>Write a story that explores a situation slight...</td>\n",
" <td>677</td>\n",
" <td>openhermes2.5</td>\n",
" <td>story_life_lessons</td>\n",
" <td>general</td>\n",
" <td>[In the bustling city of New York, there was a...</td>\n",
" <td>[[0.01869477, -0.0056158905, 0.0034837904, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Once upon a time in a small village named Pix...</td>\n",
" <td>Write an educational story (3-5 paragraphs) ta...</td>\n",
" <td>618</td>\n",
" <td>ultrachat</td>\n",
" <td>story_children</td>\n",
" <td>young_children</td>\n",
" <td>[Once upon a time in a small village named Pix...</td>\n",
" <td>[[0.010998854, -0.003700477, 0.010283143, 0.01...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>A couple of months ago, I stumbled upon an in...</td>\n",
" <td>Write a real-life story shared by someone in a...</td>\n",
" <td>755</td>\n",
" <td>openhermes2.5</td>\n",
" <td>story_reddit</td>\n",
" <td>general</td>\n",
" <td>[A couple of months ago, I stumbled upon an in...</td>\n",
" <td>[[0.0069475807, 0.005131935, 0.0044895806, 0.0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>I've always loved living in Murcia, Spain - i...</td>\n",
" <td>Write a story in the style of real-life situat...</td>\n",
" <td>629</td>\n",
" <td>ultrachat</td>\n",
" <td>story_forums</td>\n",
" <td>general</td>\n",
" <td>[I've always loved living in Murcia, Spain - i...</td>\n",
" <td>[[0.00025026768, 0.0066834814, 0.009650952, 0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Once upon a time, in a bustling city called N...</td>\n",
" <td>Write an educational story (3-5 paragraphs) ta...</td>\n",
" <td>509</td>\n",
" <td>ultrachat</td>\n",
" <td>story_children</td>\n",
" <td>young_children</td>\n",
" <td>[Once upon a time, in a bustling city called N...</td>\n",
" <td>[[0.0053570517, 0.0042380523, 0.0048442776, 0....</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text \\\n",
"0 Once upon a time, in a village called Kiwilan... \n",
"1 In a bustling town full of curious creatures ... \n",
"2 Step 3: Embracing an Unconventional Warmup Ro... \n",
"3 Once upon a time, in a small town named Harmo... \n",
"4 On a bright, sunny day, two best friends, Tim... \n",
"5 In the bustling city of New York, there was a... \n",
"6 Once upon a time in a small village named Pix... \n",
"7 A couple of months ago, I stumbled upon an in... \n",
"8 I've always loved living in Murcia, Spain - i... \n",
"9 Once upon a time, in a bustling city called N... \n",
"\n",
" prompt text_token_length \\\n",
"0 Write an educational story (3-5 paragraphs) ta... 520 \n",
"1 Write an educational story (3-5 paragraphs) ta... 381 \n",
"2 Write a real-life story shared by someone in a... 580 \n",
"3 Write an educational story (3-5 paragraphs) ta... 439 \n",
"4 Write an educational story (3-5 paragraphs) ta... 414 \n",
"5 Write a story that explores a situation slight... 677 \n",
"6 Write an educational story (3-5 paragraphs) ta... 618 \n",
"7 Write a real-life story shared by someone in a... 755 \n",
"8 Write a story in the style of real-life situat... 629 \n",
"9 Write an educational story (3-5 paragraphs) ta... 509 \n",
"\n",
" seed_data format audience \\\n",
"0 ultrachat story_children young_children \n",
"1 openhermes2.5 story_children young_children \n",
"2 openhermes2.5 story_reddit general \n",
"3 ultrachat story_children young_children \n",
"4 openhermes2.5 story_children young_children \n",
"5 openhermes2.5 story_life_lessons general \n",
"6 ultrachat story_children young_children \n",
"7 openhermes2.5 story_reddit general \n",
"8 ultrachat story_forums general \n",
"9 ultrachat story_children young_children \n",
"\n",
" text_sentences \\\n",
"0 [Once upon a time, in a village called Kiwilan... \n",
"1 [In a bustling town full of curious creatures ... \n",
"2 [Step 3: Embracing an Unconventional Warmup Ro... \n",
"3 [Once upon a time, in a small town named Harmo... \n",
"4 [On a bright, sunny day, two best friends, Tim... \n",
"5 [In the bustling city of New York, there was a... \n",
"6 [Once upon a time in a small village named Pix... \n",
"7 [A couple of months ago, I stumbled upon an in... \n",
"8 [I've always loved living in Murcia, Spain - i... \n",
"9 [Once upon a time, in a bustling city called N... \n",
"\n",
" text_sentences_embedding \n",
"0 [[0.009837754, 0.003806148, -0.008218781, 0.00... \n",
"1 [[0.0026006007, -0.008539987, 0.009180809, 0.0... \n",
"2 [[-0.0052720755, 0.0026521664, 0.018520469, 0.... \n",
"3 [[-0.0006834645, 0.0034085037, -0.011185894, 0... \n",
"4 [[-0.013423425, 0.007448226, -0.010567971, 0.0... \n",
"5 [[0.01869477, -0.0056158905, 0.0034837904, -0.... \n",
"6 [[0.010998854, -0.003700477, 0.010283143, 0.01... \n",
"7 [[0.0069475807, 0.005131935, 0.0044895806, 0.0... \n",
"8 [[0.00025026768, 0.0066834814, 0.009650952, 0.... \n",
"9 [[0.0053570517, 0.0042380523, 0.0048442776, 0.... "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the processed dataset as a pandas DataFrame. This matches the table saved with\n",
"# this cell and, unlike to_polars(), does not require installing polars.\n",
"ds.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "sonar",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}