@@ -9,94 +9,111 @@ import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.resultStyle
9
9
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.t
10
10
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.warningStyle
11
11
import io.github.yamin8000.twitterscrapper.util.Constants
12
+ import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_CRAWL_DEPTH_LIMIT
12
13
import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_CRAWL_TWEETS_LIMIT
13
14
import io.github.yamin8000.twitterscrapper.util.Constants.PROTECTED_ACCOUNT
15
+ import io.github.yamin8000.twitterscrapper.util.KTree
14
16
import io.github.yamin8000.twitterscrapper.util.Utility.csvOf
15
17
import io.github.yamin8000.twitterscrapper.web.retryingGet
16
18
import kotlinx.coroutines.*
19
+ import kotlinx.coroutines.channels.Channel
17
20
import org.jsoup.Jsoup
18
21
import org.jsoup.select.Elements
19
22
import java.io.File
20
23
21
24
class Crawler (
22
- private val isNested : Boolean = true ,
25
+ isNested : Boolean = true
23
26
) {
24
27
/** Scope on which every crawl coroutine of this crawler is launched (IO dispatcher). */
private val scope = CoroutineScope(Dispatchers.IO)

/** Buffered channel of crawl jobs; each launched job is sent once and each job receives once when it ends. */
private val channel = Channel<Job>(capacity = 4)

/** Users the crawl starts from; assigned once in the `init` block. */
private val startingUsers: List<String>

/** Maximum number of tweets kept per user. */
private var tweetCountLimit = DEFAULT_CRAWL_TWEETS_LIMIT

/** Trigger words used to filter tweets; an empty list means no filtering. */
private var triggers: List<String> = emptyList()

/** Root of the user tree; created lazily by the first crawl job that runs. */
private var root: KTree<String>? = null

/** Deepest tree level that is still crawled. */
private var depthLimit = DEFAULT_CRAWL_DEPTH_LIMIT
31
40
32
41
// Interactive configuration of the crawler: reads the starting users and, optionally,
// the per-user tweet limit, the crawl depth limit and the trigger words.
init {
    startingUsers = readMultipleStrings("Starting user").map { it.sanitizeUser() }

    // A non-nested crawler is pinned to depth 1 whether or not the user chooses to
    // customize; previously this only happened inside the customization branch, so
    // answering "no" below left the default depth limit in place.
    if (!isNested) depthLimit = 1

    if (readBoolean("Do you want to customize the crawler?")) {
        if (readBoolean("Do you want to limit the number of tweets for each user?")) {
            tweetCountLimit = readInteger(
                message = "Enter tweet limit for each user.",
                range = 1..DEFAULT_CRAWL_TWEETS_LIMIT
            )
        }
        // The depth prompt only makes sense for a nested crawl.
        // NOTE(review): no range is passed here, so whatever readInteger accepts by
        // default (possibly zero or negative values) becomes the depth limit — confirm.
        if (isNested && readBoolean("Do you want to specify crawl depth limit?")) {
            depthLimit = readInteger("Crawl depth limit")
        }
        if (readBoolean("Do you want to filter tweets with Trigger words?")) {
            triggers = readMultipleStrings("Trigger word")
        }
    }
}
43
58
59
/**
 * Starts one crawl job per starting user and suspends until every crawl job launched
 * on [scope] (including jobs started by other jobs) has finished, then reports that
 * the crawler stopped.
 *
 * Completion is detected by joining the children of the scope's [Job] instead of
 * polling `channel.isEmpty`: the channel can be momentarily empty while a job is still
 * running, and it is never drained if a job ends without receiving.
 */
suspend fun crawl() {
    startingUsers.forEach { user ->
        // Each launched job is also sent to the channel because every job performs
        // exactly one receive when it ends.
        channel.send(singleUserCrawler(user))
    }

    val parentJob = scope.coroutineContext[Job]
    while (true) {
        // Snapshot the jobs that are still running; jobs they launch in the meantime
        // are picked up by the next iteration.
        val pending = parentJob?.children?.filter { !it.isCompleted }?.toList().orEmpty()
        if (pending.isEmpty()) break
        pending.joinAll()
    }
    t.println(resultStyle("Crawler Stopped!"))
}
51
72
52
73
/**
 * Launches a coroutine on [scope] that crawls [username] and returns its [Job].
 *
 * The job fetches the user's tweets and friends, keeps only tweets containing at least
 * one trigger word when [triggers] is not empty, saves them, records the friends as
 * children of the user's tree node and launches a crawl for every child that is still
 * within [depthLimit]. Users whose output file already exists are skipped.
 *
 * Whatever happens, the job performs exactly one `channel.receive()` before it ends,
 * matching the `send` done by whoever launched it. Failures are reported on the
 * terminal instead of propagating, so one failing user does not cancel the other crawls.
 *
 * @param username sanitized user name to crawl.
 * @return the launched crawl job.
 */
private suspend fun singleUserCrawler(
    username: String
): Job = scope.launch {
    try {
        if (root == null) root = KTree(username)

        t.println(infoStyle("Crawling: ") + resultStyle(username))

        if (!File("${Constants.DOWNLOAD_PATH}/$username.txt").exists()) {
            val (tweets, friends) = crawlUsername(username, tweetCountLimit)
            t.println(infoStyle("New results for $username"))

            // One pass over the tweets: a tweet matching several triggers is kept once,
            // so duplicates no longer use up the tweet limit before de-duplication.
            val newTweets =
                if (triggers.isEmpty()) tweets
                else tweets.filter { tweet -> triggers.any { trigger -> tweet.contains(trigger) } }

            // Users that are not in the tree yet fall back to the root node.
            val node = root?.findDescendant(username) ?: root
            t.println(infoStyle("$username, Tree level: ${node?.level}"))
            if (tweets.isNotEmpty() && node != null && node.level <= depthLimit) {
                if (newTweets.isNotEmpty())
                    saveUserPosts(username, newTweets.take(tweetCountLimit).toSet())

                friends.forEach { friend -> node.addChild(friend) }

                if (depthLimit >= 1) {
                    node.children().filter { it.level <= depthLimit }.forEach { child ->
                        channel.send(singleUserCrawler(child.data))
                    }
                }
            } else t.println(warningStyle("Empty tweets for $username"))
        } else t.println(warningStyle("$username is already being crawled"))
    } catch (e: CancellationException) {
        // Cancellation must keep propagating so the coroutine machinery can unwind.
        throw e
    } catch (e: Exception) {
        t.println(errorStyle(e.message ?: ""))
    } finally {
        // Always pair the sender's token with one receive, even after a failure.
        channel.receive()
    }
}
93
108
94
109
private suspend fun crawlUsername (
95
110
username : String ,
96
- onNewElements : ( Elements ) -> Unit
97
- ) {
111
+ limit : Int
112
+ ): Pair < List < String >, List<String>> {
98
113
var cursor: String? = " "
99
114
var html: String
115
+ val tweets = mutableSetOf<String >()
116
+ val friends = mutableSetOf<String >()
100
117
do {
101
118
html = withContext(scope.coroutineContext) { retryingGet(" $username ?cursor=$cursor " )?.body?.string() ? : " " }
102
119
if (html.contains(PROTECTED_ACCOUNT )) {
@@ -113,8 +130,15 @@ class Crawler(
113
130
?.attr(" href" )
114
131
?.split(' =' )
115
132
?.last()
116
- onNewElements(doc.allElements)
133
+ tweets.addAll(getSingles(doc.allElements))
134
+ friends.addAll(
135
+ fetchNewUsers(html)
136
+ .map { it.sanitizeUser() }
137
+ .filter { it != username }
138
+ )
139
+ if (tweets.take(limit).size >= limit) break
117
140
} while (cursor != null )
141
+ return tweets.take(limit) to friends.toList()
118
142
}
119
143
120
144
private fun getSingles (
@@ -130,15 +154,9 @@ class Crawler(
130
154
t.println (infoStyle(" Saving $username tweets" ))
131
155
val file = File (" ${Constants .DOWNLOAD_PATH } /$username .txt" )
132
156
133
- var bias = 0
134
- var headers: List <String >? = listOf (" #" , " tweet" )
135
- if (file.exists()) {
136
- bias = file.readText().split(" \n " ).size
137
- headers = null
138
- }
157
+ val headers = listOf (" #" , " tweet" )
139
158
140
159
val csv = csvOf(
141
- indexBias = bias,
142
160
headers = headers,
143
161
data = tweets,
144
162
itemBuilder = { index, item ->
0 commit comments