Skip to content
This repository was archived by the owner on Aug 14, 2023. It is now read-only.

Commit 331448d

Browse files
committed
Enhanced the Crawler.kt using Kotlin Coroutine Channels
1 parent 44f9c93 commit 331448d

File tree

7 files changed

+124
-57
lines changed

7 files changed

+124
-57
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ Alternatively, you can download the jar file from the releases section or build
6666
## Features
6767

6868
- Poor man's Crawler
69+
- Search with trigger
70+
- User tweets limit
71+
- Depth limit
72+
- User info
6973
- More features (WIP)
7074

7175
## License

build.gradle.kts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ plugins {
88
}
99

1010
group = "io.github.yamin8000"
11-
version = "1.0.0"
11+
version = "1.0.1"
1212

1313
repositories {
1414
mavenCentral()

src/main/kotlin/io/github/yamin8000/twitterscrapper/helpers/ConsoleHelper.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ object ConsoleHelper {
8585
*/
8686
fun readBoolean(message: String? = null): Boolean {
8787
return try {
88-
if (message != null) t.println(askStyle(message))
88+
if (message != null) t.println(askStyle("${message}(y/n)"))
8989
readCleanLine().lowercase(Locale.getDefault()) in affirmatives
9090
} catch (exception: Exception) {
9191
false

src/main/kotlin/io/github/yamin8000/twitterscrapper/modules/crawler/Crawler.kt

Lines changed: 72 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -9,94 +9,111 @@ import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.resultStyle
99
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.t
1010
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.warningStyle
1111
import io.github.yamin8000.twitterscrapper.util.Constants
12+
import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_CRAWL_DEPTH_LIMIT
1213
import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_CRAWL_TWEETS_LIMIT
1314
import io.github.yamin8000.twitterscrapper.util.Constants.PROTECTED_ACCOUNT
15+
import io.github.yamin8000.twitterscrapper.util.KTree
1416
import io.github.yamin8000.twitterscrapper.util.Utility.csvOf
1517
import io.github.yamin8000.twitterscrapper.web.retryingGet
1618
import kotlinx.coroutines.*
19+
import kotlinx.coroutines.channels.Channel
1720
import org.jsoup.Jsoup
1821
import org.jsoup.select.Elements
1922
import java.io.File
2023

2124
class Crawler(
22-
private val isNested: Boolean = true,
25+
isNested: Boolean = true
2326
) {
2427
private val scope = CoroutineScope(Dispatchers.IO)
2528

26-
private var startingUsers = listOf<String>()
29+
private val channel = Channel<Job>(4)
30+
31+
private val startingUsers: List<String>
2732

2833
private var tweetCountLimit = DEFAULT_CRAWL_TWEETS_LIMIT
2934

30-
private var triggers: List<String> = listOf()
35+
private var triggers = listOf<String>()
36+
37+
private var root: KTree<String>? = null
38+
39+
private var depthLimit = DEFAULT_CRAWL_DEPTH_LIMIT
3140

3241
init {
3342
startingUsers = readMultipleStrings("Starting user").map { it.sanitizeUser() }
34-
if (readBoolean("Do you want to limit the number of tweets for each user?(y/n)")) {
35-
tweetCountLimit = readInteger(
36-
message = "Enter tweet limit for each user.",
37-
range = 1..DEFAULT_CRAWL_TWEETS_LIMIT
38-
)
43+
if (readBoolean("Do you want to customize the crawler?")) {
44+
if (readBoolean("Do you want to limit the number of tweets for each user?")) {
45+
tweetCountLimit = readInteger(
46+
message = "Enter tweet limit for each user.",
47+
range = 1..DEFAULT_CRAWL_TWEETS_LIMIT
48+
)
49+
}
50+
if (isNested) {
51+
if (readBoolean("Do you want to specify crawl depth limit?"))
52+
depthLimit = readInteger("Crawl depth limit")
53+
} else depthLimit = 1
54+
if (readBoolean("Do you want to filter tweets with Trigger words?"))
55+
triggers = readMultipleStrings("Trigger word")
3956
}
40-
if (readBoolean("Do you want to filter tweets with Trigger words?(y/n)"))
41-
triggers = readMultipleStrings("Trigger word")
4257
}
4358

59+
@OptIn(ExperimentalCoroutinesApi::class)
4460
suspend fun crawl() {
45-
buildList {
46-
startingUsers.forEach { user ->
47-
add(scope.launch { singleUserCrawler(user) })
61+
startingUsers.forEach { user ->
62+
channel.send(singleUserCrawler(user))
63+
}
64+
while (true) {
65+
if (channel.isEmpty) {
66+
t.println(resultStyle("Crawler Stopped!"))
67+
break
4868
}
49-
}.joinAll()
69+
delay(5000)
70+
}
5071
}
5172

5273
private suspend fun singleUserCrawler(
5374
username: String
54-
) {
75+
): Job = scope.launch {
76+
if (root == null) root = KTree(username)
77+
5578
t.println(infoStyle("Crawling: ") + resultStyle(username))
5679

5780
if (!File("${Constants.DOWNLOAD_PATH}/$username.txt").exists()) {
58-
var tweetCount = 0
59-
try {
60-
crawlUsername(username) { elements ->
61-
t.println(infoStyle("New results for $username"))
62-
val tweets = getSingles(elements)
63-
val tweetsWithTriggers = mutableListOf<String>()
64-
triggers.forEach { trigger ->
65-
tweetsWithTriggers.addAll(tweets.filter { it.contains(trigger) })
66-
}
67-
68-
var newTweets = tweets
69-
var newTweetsCount = newTweets.size
70-
if (triggers.isNotEmpty()) {
71-
newTweets = tweetsWithTriggers
72-
newTweetsCount = tweetsWithTriggers.size
73-
}
81+
val (tweets, friends) = crawlUsername(username, tweetCountLimit)
82+
t.println(infoStyle("New results for $username"))
83+
val tweetsWithTriggers = mutableListOf<String>()
84+
triggers.forEach { trigger ->
85+
tweetsWithTriggers.addAll(tweets.filter { it.contains(trigger) })
86+
}
7487

75-
if (newTweets.isNotEmpty())
76-
saveUserPosts(username, newTweets.take(tweetCountLimit).toSet())
77-
else t.println(warningStyle("Empty tweets for $username"))
88+
val newTweets = if (triggers.isNotEmpty()) tweetsWithTriggers else tweets
7889

79-
tweetCount += newTweetsCount
80-
if (tweetCount >= tweetCountLimit) throw Exception("Tweet count limit reached for $username")
90+
val node = root?.findDescendant(username) ?: root
91+
t.println(infoStyle("$username, Tree level: ${node?.level}"))
92+
if (tweets.isNotEmpty() && node != null && node.level <= depthLimit) {
93+
if (newTweets.isNotEmpty())
94+
saveUserPosts(username, newTweets.take(tweetCountLimit).toSet())
8195

82-
val friends = fetchNewUsers(elements.html())
83-
.map { it.sanitizeUser() }
84-
.filter { it != username }
85-
if (isNested)
86-
friends.forEach { scope.launch { singleUserCrawler(it) } }
96+
friends.forEach {
97+
node.addChild(it)
8798
}
88-
} catch (e: Exception) {
89-
t.println(errorStyle(e.message ?: ""))
90-
}
99+
if (depthLimit >= 1) {
100+
node.children().filter { it.level <= depthLimit }.forEach {
101+
channel.send(singleUserCrawler(it.data))
102+
}
103+
}
104+
} else t.println(warningStyle("Empty tweets for $username"))
91105
} else t.println(warningStyle("$username is already being crawled"))
106+
channel.receive()
92107
}
93108

94109
private suspend fun crawlUsername(
95110
username: String,
96-
onNewElements: (Elements) -> Unit
97-
) {
111+
limit: Int
112+
): Pair<List<String>, List<String>> {
98113
var cursor: String? = ""
99114
var html: String
115+
val tweets = mutableSetOf<String>()
116+
val friends = mutableSetOf<String>()
100117
do {
101118
html = withContext(scope.coroutineContext) { retryingGet("$username?cursor=$cursor")?.body?.string() ?: "" }
102119
if (html.contains(PROTECTED_ACCOUNT)) {
@@ -113,8 +130,15 @@ class Crawler(
113130
?.attr("href")
114131
?.split('=')
115132
?.last()
116-
onNewElements(doc.allElements)
133+
tweets.addAll(getSingles(doc.allElements))
134+
friends.addAll(
135+
fetchNewUsers(html)
136+
.map { it.sanitizeUser() }
137+
.filter { it != username }
138+
)
139+
if (tweets.take(limit).size >= limit) break
117140
} while (cursor != null)
141+
return tweets.take(limit) to friends.toList()
118142
}
119143

120144
private fun getSingles(
@@ -130,15 +154,9 @@ class Crawler(
130154
t.println(infoStyle("Saving $username tweets"))
131155
val file = File("${Constants.DOWNLOAD_PATH}/$username.txt")
132156

133-
var bias = 0
134-
var headers: List<String>? = listOf("#", "tweet")
135-
if (file.exists()) {
136-
bias = file.readText().split("\n").size
137-
headers = null
138-
}
157+
val headers = listOf("#", "tweet")
139158

140159
val csv = csvOf(
141-
indexBias = bias,
142160
headers = headers,
143161
data = tweets,
144162
itemBuilder = { index, item ->

src/main/kotlin/io/github/yamin8000/twitterscrapper/modules/crawler/CrawlerModule.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class CrawlerModule : BaseModule(Menus.crawlerMenu) {
2323
) {
2424
val crawler = Crawler(isNested)
2525
runBlocking {
26-
withContext(Dispatchers.Default) {
26+
withContext(Dispatchers.Unconfined) {
2727
crawler.crawl()
2828
}
2929
}

src/main/kotlin/io/github/yamin8000/twitterscrapper/util/Constants.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ object Constants {
1010
var DOWNLOAD_PATH = "c:\\TwitterScrapper"
1111

1212
const val DEFAULT_CRAWL_TWEETS_LIMIT = 500
13+
const val DEFAULT_CRAWL_DEPTH_LIMIT = 3
1314

1415
val FAILED_REQUEST_DELAY = 50L..500L
1516

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package io.github.yamin8000.twitterscrapper.util
2+
3+
/**
 * A minimal generic n-ary tree node.
 *
 * Every node stores a payload ([data]), a link to its [parent] and the list of its direct
 * children. In addition, each node keeps a flat index of all nodes that were added below it
 * through [addChild], so [findDescendant] can look a payload up anywhere in its subtree.
 *
 * The class performs no synchronization of its own; callers that mutate a tree from several
 * threads or coroutines must coordinate access themselves.
 *
 * @param T type of the payload stored in each node
 * @param root payload of this node
 */
class KTree<T>(root: T) {
    /** Parent of this node, or `null` when this node is the root of its tree. */
    var parent: KTree<T>? = null

    /** Payload stored in this node. */
    var data: T = root

    /** `true` when this node has no [parent]. */
    val isRoot: Boolean
        get() = parent == null

    /** `true` when this node has no direct children. */
    val isLeaf: Boolean
        get() = directChildren.isEmpty()

    /** Number of ancestors above this node; the root is at level 0. */
    val level: Int
        get() = generateSequence(parent) { it.parent }.count()

    private val directChildren = mutableListOf<KTree<T>>()

    // Flat index of every node added below this one (children, grandchildren, ...).
    private val descendants = mutableListOf<KTree<T>>()

    /** Returns a snapshot of the direct children of this node. */
    fun children() = directChildren.toList()

    /**
     * Returns the top-most ancestor of this node, or the node itself when it is the root.
     *
     * Walks the [parent] chain iteratively; the previous implementation called itself on
     * `this` and therefore never terminated for a non-root node.
     */
    fun root(): KTree<T> = generateSequence(this) { it.parent }.last()

    /**
     * Creates a node holding [child], attaches it below this node and returns it.
     *
     * The new node is registered as a descendant of this node and of every ancestor up to
     * the root, so that [findDescendant] works from any level of the tree and not only
     * from the parent and grandparent.
     */
    fun addChild(child: T): KTree<T> {
        val childNode = KTree(child)
        childNode.parent = this
        directChildren.add(childNode)
        generateSequence(this) { it.parent }.forEach { it.addDescendant(childNode) }
        return childNode
    }

    /** Records [descendant] in this node's flat descendant index. */
    private fun addDescendant(descendant: KTree<T>) {
        descendants.add(descendant)
    }

    /** Returns the first direct child whose payload equals [node], or `null` if there is none. */
    fun findChild(node: T) = directChildren.find { it.data == node }

    /**
     * Returns the first node below this one whose payload equals [node], or `null` if there
     * is none. Direct children are checked before deeper descendants.
     */
    fun findDescendant(node: T) = findChild(node) ?: descendants.find { it.data == node }

    /** Returns the string form of this node's payload. */
    override fun toString() = data.toString()
}

0 commit comments

Comments
 (0)