|
386 | 386 | "cell_type": "markdown", |
387 | 387 | "metadata": {}, |
388 | 388 | "source": [ |
389 | | - "### Thai Character Cluster (TCC)\n", |
| 389 | + "### Subword and Thai Character Cluster (TCC)\n", |
390 | 390 | "\n", |
391 | 391 | "Thai Character Cluster (TCC) segmentation follows the rules described in [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)." |
392 | 392 | ] |
|
408 | 408 | } |
409 | 409 | ], |
410 | 410 | "source": [ |
411 | | - "from pythainlp.tokenize import subword_tokenize\n", |
| 411 | + "from pythainlp import subword_tokenize\n", |
412 | 412 | "\n", |
413 | 413 | "subword_tokenize(\"ประเทศไทย\")" |
414 | 414 | ] |
415 | 415 | }, |
416 | | - { |
417 | | - "cell_type": "code", |
418 | | - "execution_count": 16, |
419 | | - "metadata": {}, |
420 | | - "outputs": [ |
421 | | - { |
422 | | - "data": { |
423 | | - "text/plain": [ |
424 | | - "False" |
425 | | - ] |
426 | | - }, |
427 | | - "execution_count": 16, |
428 | | - "metadata": {}, |
429 | | - "output_type": "execute_result" |
430 | | - } |
431 | | - ], |
432 | | - "source": [ |
433 | | - "isinstance(subword_tokenize(\"ประเทศไทย\", engine=\"etcc\"), str)" |
434 | | - ] |
435 | | - }, |
436 | 416 | { |
437 | 417 | "cell_type": "markdown", |
438 | 418 | "metadata": {}, |
|
442 | 422 | }, |
443 | 423 | { |
444 | 424 | "cell_type": "code", |
445 | | - "execution_count": 17, |
| 425 | + "execution_count": 16, |
446 | 426 | "metadata": {}, |
447 | 427 | "outputs": [ |
448 | 428 | { |
|
451 | 431 | "['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']" |
452 | 432 | ] |
453 | 433 | }, |
454 | | - "execution_count": 17, |
| 434 | + "execution_count": 16, |
455 | 435 | "metadata": {}, |
456 | 436 | "output_type": "execute_result" |
457 | 437 | } |
458 | 438 | ], |
459 | 439 | "source": [ |
460 | | - "from pythainlp import tcc\n", |
| 440 | + "from pythainlp.tokenize import tcc\n", |
461 | 441 | "\n", |
462 | 442 | "tcc.segment(\"ประเทศไทย\")" |
463 | 443 | ] |
464 | 444 | }, |
465 | 445 | { |
466 | 446 | "cell_type": "code", |
467 | | - "execution_count": 18, |
| 447 | + "execution_count": 17, |
468 | 448 | "metadata": {}, |
469 | 449 | "outputs": [ |
470 | 450 | { |
|
473 | 453 | "{1, 3, 5, 6, 8, 9}" |
474 | 454 | ] |
475 | 455 | }, |
476 | | - "execution_count": 18, |
| 456 | + "execution_count": 17, |
477 | 457 | "metadata": {}, |
478 | 458 | "output_type": "execute_result" |
479 | 459 | } |
|
484 | 464 | }, |
485 | 465 | { |
486 | 466 | "cell_type": "code", |
487 | | - "execution_count": 19, |
| 467 | + "execution_count": 18, |
488 | 468 | "metadata": {}, |
489 | 469 | "outputs": [ |
490 | 470 | { |
|
509 | 489 | }, |
510 | 490 | { |
511 | 491 | "cell_type": "code", |
512 | | - "execution_count": 20, |
| 492 | + "execution_count": 19, |
513 | 493 | "metadata": {}, |
514 | 494 | "outputs": [ |
515 | 495 | { |
|
518 | 498 | "'maeo'" |
519 | 499 | ] |
520 | 500 | }, |
521 | | - "execution_count": 20, |
| 501 | + "execution_count": 19, |
522 | 502 | "metadata": {}, |
523 | 503 | "output_type": "execute_result" |
524 | 504 | } |
|
531 | 511 | }, |
532 | 512 | { |
533 | 513 | "cell_type": "code", |
534 | | - "execution_count": 21, |
| 514 | + "execution_count": 20, |
535 | 515 | "metadata": {}, |
536 | 516 | "outputs": [ |
537 | 517 | { |
538 | | - "name": "stdout", |
539 | | - "output_type": "stream", |
540 | | - "text": [ |
541 | | - "mɛːw\n" |
542 | | - ] |
| 518 | + "data": { |
| 519 | + "text/plain": [ |
| 520 | + "'mɛːw'" |
| 521 | + ] |
| 522 | + }, |
| 523 | + "execution_count": 20, |
| 524 | + "metadata": {}, |
| 525 | + "output_type": "execute_result" |
543 | 526 | } |
544 | 527 | ], |
545 | 528 | "source": [ |
546 | 529 | "from pythainlp.transliterate import transliterate\n", |
547 | 530 | "\n", |
548 | | - "print(transliterate(\"แมว\"))" |
| 531 | + "transliterate(\"แมว\")" |
| 532 | + ] |
| 533 | + }, |
| 534 | + { |
| 535 | + "cell_type": "code", |
| 536 | + "execution_count": 21, |
| 537 | + "metadata": {}, |
| 538 | + "outputs": [], |
| 539 | + "source": [ |
| 540 | + "#!pip3 install pythainlp[icu]\n", |
| 541 | + "#transliterate(\"แมว\", engine=\"icu\")" |
549 | 542 | ] |
550 | 543 | }, |
551 | 544 | { |
|
736 | 729 | { |
737 | 730 | "data": { |
738 | 731 | "text/plain": [ |
739 | | - "[('งวงช้าง', 12),\n", |
740 | | - " ('เทิบทาบ', 7),\n", |
741 | | - " ('กริน', 3),\n", |
742 | | - " ('นาภี', 2),\n", |
743 | | - " ('แด่วๆ', 3),\n", |
744 | | - " ('คู่ใจ', 7),\n", |
745 | | - " ('คุณพ่อ', 732),\n", |
746 | | - " ('สิ้น', 755),\n", |
747 | | - " ('เยาะ', 150)]" |
| 732 | + "[('ลุ่น', 4),\n", |
| 733 | + " ('คั่น', 53),\n", |
| 734 | + " ('ไก่ป่า', 29),\n", |
| 735 | + " ('ปริพาชก', 4),\n", |
| 736 | + " ('สิกขาบท', 4),\n", |
| 737 | + " ('คัดลายมือ', 2),\n", |
| 738 | + " ('เลียบ', 53),\n", |
| 739 | + " ('เกือบๆ', 6),\n", |
| 740 | + " ('จันทรคติ', 6)]" |
748 | 741 | ] |
749 | 742 | }, |
750 | 743 | "execution_count": 28, |
|
0 commit comments