Skip to content

Commit 2ea3321

Browse files
committed
Added IMDB top movie scraping
1 parent 6d9010f commit 2ea3321

File tree

3 files changed

+196
-47
lines changed

3 files changed

+196
-47
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,4 @@ Hopefully will work on older version too (as old as 5.10) and across diferent OS
139139
| Weather <ul><li>LWP::UserAgent</li></ul><ul><li>Mojo::UserAgent</li></ul> | Completed |
140140
| reCAPTCHA v2 | Completed |
141141
| reCAPTCHA v3 | Completed |
142+
| IMDB top movies | Completed |

web_programming/get_imdb_top.pl

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env perl
2+
# Search the IMDB top list of movies based on advanced search - https://www.imdb.com/search/title/
3+
# The user provides how many top movies they want to see
4+
# Output will contain all details about the movies
5+
6+
# https://metacpan.org/pod/Mojo::UserAgent
7+
# https://metacpan.org/pod/Mojo::DOM
8+
# https://metacpan.org/pod/Mojo::Util
9+
10+
use strict;
11+
use warnings;
12+
use Carp qw( croak );
13+
use Mojo::UserAgent;
14+
use Mojo::Util qw( trim );
15+
use Data::Dumper;
16+
17+
# Fetch an IMDB advanced-search result page and dump the details of every
# movie listed on it.
#
# Parameters:
#   $ua  - Mojo::UserAgent instance used to perform the GET request
#   $url - URL of the search result page
#
# Output:
#   Prints, via Data::Dumper, an array reference holding one hash per movie.
#   Each hash has a single key (the movie name) whose value is a hash of the
#   scraped fields: Released Year, Certificate, Runtime, Genre, Rating,
#   Metascore, Abstract, Votes and Gross.
#
# Croaks with the response message when the request is not successful.
sub crawl_results {
    my ($ua, $url) = @_;

    my $response = $ua->get($url)->result;
    croak $response->message unless $response->is_success;

    # https://docs.mojolicious.org/Mojo/DOM#find
    my $movies_list = $response->dom->find('div.lister-item-content');

    my @top_movies_info;
    for my $div ($movies_list->each) {

        # Text of every element matching $selector, joined by newlines and
        # explicitly converted to a plain string.
        # https://docs.mojolicious.org/Mojo/Collection#map
        # https://docs.mojolicious.org/Mojo/Collection#join
        my $text_of = sub {
            my ($selector) = @_;
            return "" . $div->find($selector)->map('text')->join("\n");
        };

        my $movie_name = $text_of->('h3.lister-item-header a');

        # The year is rendered as "(1994)"; strip the surrounding parentheses
        my $release_year = $text_of->('h3.lister-item-header span.lister-item-year');
        $release_year =~ s/^\(|\)$//g;

        # Both 'votes' and 'gross' are contained inside 'span' elements having
        # the same 'name'. The first is the votes, the second is the gross.
        # https://metacpan.org/pod/Mojo::DOM::CSS#SELECTORS
        my $name_values = $div->find('p.sort-num_votes-visible span[name="nv"]')->map('text');

        my %details = (
            "Released Year" => $release_year,
            "Certificate"   => $text_of->('p span.certificate'),
            "Runtime"       => $text_of->('p span.runtime'),
            "Genre"         => trim($text_of->('p span.genre')),
            "Rating"        => $text_of->('div.inline-block.ratings-imdb-rating strong'),
            "Metascore"     => trim($text_of->('div.inline-block.ratings-metascore span.metascore')),
            "Abstract"      => trim($text_of->('p.text-muted')),

            # Either span may be absent (e.g. no gross reported); fall back to
            # an empty string instead of concatenating undef, which would
            # raise an "uninitialized value" warning.
            "Votes" => "" . ($name_values->[0] // ''),
            "Gross" => "" . ($name_values->[1] // ''),
        );

        push @top_movies_info, { $movie_name => \%details };
    }

    print Dumper(\@top_movies_info);
}
59+
60+
# Entry point: ask the user how many top movies to show, build the IMDB
# advanced-search URL (feature films sorted by number of votes, descending)
# and hand it to crawl_results() together with a configured user agent.
#
# Croaks when no input is available or when the answer is not a positive
# whole number.
sub main {
    print "\nHow many top movies you want to see: ";

    # Read the answer from STDIN. <ARGV> would instead treat any command-line
    # arguments as names of files to read from.
    my $top_movies = <STDIN>;
    croak "No input received" unless defined $top_movies;
    chomp($top_movies);
    $top_movies = trim $top_movies;

    # The value is appended to the query string, so accept digits only
    croak "Expected a positive whole number but got '$top_movies'"
        unless $top_movies =~ /\A[1-9][0-9]*\z/;

    my $url = "https://www.imdb.com/search/title/?title_type=feature&sort=num_votes,desc&count=" . $top_movies;

    my $ua = Mojo::UserAgent->new;

    # Send a browser-like User-Agent header with every request
    $ua->transactor->name('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0');

    crawl_results($ua, $url);
}
70+
71+
main();
72+
73+
__END__
74+
75+
Output:
76+
77+
How many top movies you want to see: 3
78+
$VAR1 = [
79+
{
80+
'The Shawshank Redemption' => {
81+
'Gross' => '$28.34M',
82+
'Certificate' => 'A',
83+
'Runtime' => '142 min',
84+
'Rating' => '9.3',
85+
'Metascore' => '80',
86+
'Votes' => '2,334,720',
87+
'Genre' => 'Drama',
88+
'Released Year' => '1994',
89+
'Abstract' => 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'
90+
}
91+
},
92+
{
93+
'The Dark Knight' => {
94+
'Released Year' => '2008',
95+
'Votes' => '2,295,724',
96+
'Genre' => 'Action, Crime, Drama',
97+
'Abstract' => 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
98+
'Rating' => '9.0',
99+
'Runtime' => '152 min',
100+
'Gross' => '$534.86M',
101+
'Certificate' => 'UA',
102+
'Metascore' => '84'
103+
}
104+
},
105+
{
106+
'Inception' => {
107+
'Metascore' => '74',
108+
'Gross' => '$292.58M',
109+
'Certificate' => 'UA',
110+
'Rating' => '8.8',
111+
'Runtime' => '148 min',
112+
'Abstract' => 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O.',
113+
'Votes' => '2,059,688',
114+
'Genre' => 'Action, Adventure, Sci-Fi',
115+
'Released Year' => '2010'
116+
}
117+
}
118+
];

web_programming/web_crawling/crawl_google_scholar_citation.pl

Lines changed: 77 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -25,34 +25,51 @@ sub crawl_results {
2525
my $divs = $response->dom->find('div.gs_ri');
2626
my @publications;
2727
for my $div ($divs->each) {
28-
my $article = {};
28+
# For getting the title
29+
# https://docs.mojolicious.org/Mojo/Collection#map
30+
# https://docs.mojolicious.org/Mojo/Collection#join
2931
my $title = $div->find('h3.gs_rt a')->map('text')->join("\n");
3032
if (defined $title && $title ne "") {
33+
my $article = {};
3134
$article->{"Title"} = "$title";
32-
}
33-
my $abstract = $div->find('div.gs_rs')->map('text')->join("\n");
34-
if (defined $abstract && $abstract ne "") {
35-
$article->{"Abstract"} = "$abstract";
36-
}
37-
my $journal = $div->find('div.gs_a')->map('text')->map(
38-
sub {
39-
my ($aut_name, $journal) = split(/-/, $_, 2);
40-
$journal =~ s/^\s?//;
41-
return $journal;
35+
36+
my $anchor_link = $div->find('h3.gs_rt a')->map(attr => 'href')->join("\n");
37+
if (defined $anchor_link && $anchor_link ne "") {
38+
$article->{"Link"} = "$anchor_link";
39+
}
40+
41+
# For getting the abstract
42+
my $abstract = $div->find('div.gs_rs')->map('text')->join("\n");
43+
if (defined $abstract && $abstract ne "") {
44+
$article->{"Abstract"} = "$abstract";
45+
}
46+
47+
# For getting the journal name
48+
my $journal = $div->find('div.gs_a')->map('text')->map(
49+
# Remove the '-' and extra space from start
50+
sub {
51+
my ($aut_name, $journal) = split(/-/, $_, 2);
52+
$journal =~ s/^\s?//;
53+
return $journal;
54+
}
55+
)->join("\n");
56+
if (defined $journal && $journal ne "") {
57+
$article->{"Journal"} = "$journal";
4258
}
43-
)->join("\n");
44-
if (defined $journal && $journal ne "") {
45-
$article->{"Journal"} = "$journal";
46-
}
4759

48-
# https://docs.mojolicious.org/Mojo/Collection
49-
my $citation
50-
= $div->find('div.gs_fl a')->grep(sub { $_->text =~ /Cited by/ })->map('text')
60+
# For getting the citation
61+
# https://docs.mojolicious.org/Mojo/Collection#grep
62+
my $citation = $div->find('div.gs_fl a')->grep(
63+
# It contains a string like 'Cited by 5'
64+
sub { $_->text =~ /Cited by/ }
65+
)
66+
->map('text')
5167
->join("\n");
52-
if (defined $citation && $citation ne "") {
53-
$article->{"Citation"} = "$citation";
68+
if (defined $citation && $citation ne "") {
69+
$article->{"Citation"} = "$citation";
70+
}
71+
push(@publications, $article);
5472
}
55-
push(@publications, $article);
5673
}
5774
return \@publications;
5875
}
@@ -62,7 +79,6 @@ sub crawl_results {
6279
}
6380

6481
sub main {
65-
my $search_text = "Perl";
6682
my $base_url = "https://scholar.google.com/scholar";
6783
my $url = Mojo::URL->new($base_url);
6884

@@ -77,7 +93,7 @@ sub main {
7793
# "hl" => <Language of the result/output, "en" means english>,
7894
# "as_q" => "<Title of the article to search> author:<name>"
7995
# }
80-
# Getting all the article for a paricular author
96+
# Getting all the article for a particular author
8197
$url = $url->query({"as_q" => "author:\"kshama Rai\"", "hl" => "en"});
8298

8399
my $ua = Mojo::UserAgent->new;
@@ -94,45 +110,59 @@ sub main {
94110
95111
[
96112
{
97-
'Title' => 'Role of supplemental UV-B in changing the level of ozone toxicity in two cultivars of sunflower: growth, seed yield and oil quality',
98-
'Journal' => 'Ecotoxicology, 2019 - Springer',
99-
'Abstract' => "Abstract Ultraviolet-B radiation (UV-B) is inherent part of solar spectrum and tropospheric ozone (O 3) is a potent secondary air pollutant. Therefore the present study was conducted to evaluate the responses of Helianthus annuus L. cvs DRSF 108 and Sungold (sunflower)\x{a0}\x{2026}",
100-
'Citation' => 'Cited by 5'
113+
'Link' => 'https://link.springer.com/article/10.1007/s10646-019-02020-6',
114+
'Citation' => 'Cited by 5',
115+
'Title' => 'Role of supplemental UV-B in changing the level of ozone toxicity in two cultivars of sunflower: growth, seed yield and oil quality',
116+
'Journal' => 'Ecotoxicology, 2019 - Springer',
117+
'Abstract' =>
118+
"Abstract Ultraviolet-B radiation (UV-B) is inherent part of solar spectrum and tropospheric ozone (O 3) is a potent secondary air pollutant. Therefore the present study was conducted to evaluate the responses of Helianthus annuus L. cvs DRSF 108 and Sungold (sunflower)\x{a0}\x{2026}"
101119
},
102120
{
121+
'Link' => 'http://www.bhu.ac.in/research_pub/jsr61/_pdf_files/06.%20Ksharma%20Rai%20&%20SB%20Agrawal.pdf',
103122
'Title' => 'Effects of UV-B radiation on morphological, physiological and biochemical aspects of plants: an overview',
104-
'Journal' => 'J Sci Res, 2017 - bhu.ac.in',
105-
'Citation' => 'Cited by 10',
106-
'Abstract' => "Origin of life was never be thought without considering the role of UV radiation but once the \x{201c}boon\x{201d}, is
107-
slowly becoming \x{201c}curse\x{201d} for life. Plants are exposed to many factors but the problem of enhanced UV-B is created by the anthropogenic activities resulted in ozone layer\x{a0}\x{2026}"
123+
'Citation' => 'Cited by 11',
124+
'Abstract' =>
125+
"Origin of life was never be thought without considering the role of UV radiation but once the \x{201c}boon\x{201d}, is slowly becoming \x{201c}curse\x{201d} for life. Plants are exposed to many factors but the problem of enhanced UV-B is created by the anthropogenic activities resulted in ozone layer\x{a0}\x{2026}",
126+
'Journal' => 'J Sci Res, 2017 - bhu.ac.in'
108127
},
109128
{
110-
'Abstract' => "In the present study sensitivity of a medicinal plant Eclipta alba L.(Hassk)(False daisy) was assessed under intermittent (IT) and continuous (CT) doses of elevated ultraviolet-B (eUV-B). Eclipta alba is rich in medicinally important phytochemical constituents, used against\x{a0}\x{2026}",
111-
'Journal' => 'Physiology and Molecular Biology of Plants, 2020 - Springer',
112-
'Title' => "Effect on essential oil components and wedelolactone content of a medicinal plant Eclipta alba due to modifications in the growth and morphology under different\x{a0}\x{2026}"
129+
'Journal' => 'Physiology and Molecular Biology of Plants, 2020 - Springer',
130+
'Abstract' =>
131+
"In the present study sensitivity of a medicinal plant Eclipta alba L.(Hassk)(False daisy) was assessed under intermittent (IT) and continuous (CT) doses of elevated ultraviolet-B (eUV-B). Eclipta alba is rich in medicinally important phytochemical constituents, used against\x{a0}\x{2026}",
132+
'Title' => "Effect on essential oil components and wedelolactone content of a medicinal plant Eclipta alba due to modifications in the growth and morphology under different\x{a0}\x{2026}",
133+
'Link' => 'https://link.springer.com/content/pdf/10.1007/s12298-020-00780-8.pdf'
113134
},
114135
{
136+
'Link' => 'https://www.sciencedirect.com/science/article/pii/B9780128164839000050',
137+
'Abstract' =>
138+
"Climate change is associated to how weather patterns change over decades or longer due to natural and human influences. Since the industrial revolution, humans have contributed to climate change through the emission of greenhouse gases and aerosols as well as changes\x{a0}\x{2026}",
139+
'Journal' => "Climate Change and Agricultural\x{a0}\x{2026}, 2019 - Elsevier",
115140
'Citation' => 'Cited by 1',
116-
'Abstract' => "Climate change is associated to how weather patterns change over decades or longer due to natural and human influences. Since the industrial revolution, humans have contributed to climate change through the emission of greenhouse gases and aerosols as well as changes\x{a0}\x{2026}",
117-
'Title' => 'Climate Change and Secondary Metabolism in Plants: Resilience to Disruption',
118-
'Journal' => "Climate Change and Agricultural\x{a0}\x{2026}, 2019 - Elsevier"
141+
'Title' => 'Climate Change and Secondary Metabolism in Plants: Resilience to Disruption'
119142
},
120143
{
121144
'Title' => 'HOST PATHOGEN INTERACTIONS BETWEEN DROSOPHILA MELANOGASTER AND BEAUVERIA BASSIANA _ A Thesis Presented to the',
122-
'Journal' => '2019 - search.proquest.com',
123-
'Abstract' => "Drosophila melanogaster is an established model organism for immunity as their immune system is similar to insect disease vectors and pests and also shares similarities with that of the mammalian innate immune system. Our study uses the entomopathogenic fungus\x{a0}\x{2026}"
145+
'Journal' => '2019 - search.proquest.com',
146+
'Abstract' =>
147+
"Drosophila melanogaster is an established model organism for immunity as their immune system is similar to insect disease vectors and pests and also shares similarities with that of the mammalian innate immune system. Our study uses the entomopathogenic fungus\x{a0}\x{2026}",
148+
'Link' => 'http://search.proquest.com/openview/868d2826bca7969ea2c29d15273af87b/1.pdf?pq-origsite=gscholar&cbl=18750&diss=y'
124149
},
125150
{
126-
'Abstract' => "Page 1. i \x{201c}LOW WEIGHT GAIN AS A PREDICTOR FOR DEVELOPMENT OF RETINOPATHY
151+
'Title' => 'Low weight gain as a predictor for development of retinopathy of prematurity',
152+
'Abstract' =>
153+
"Page 1. i \x{201c}LOW WEIGHT GAIN AS A PREDICTOR FOR DEVELOPMENT OF RETINOPATHY
127154
OF PREMATURITY\x{201d} By Dr. KSHAMA RAI MBBS Dissertation Submitted to the Rajiv Gandhi
128155
University of Health Sciences, Karnataka, Bangalore In partial fulfilment of the requirements\x{a0}\x{2026}",
129-
'Title' => 'Low weight gain as a predictor for development of retinopathy of prematurity',
130-
'Journal' => '2018 - 112.133.228.240'
156+
'Journal' => '2018 - 112.133.228.240',
157+
'Link' => 'http://112.133.228.240/xmlui/bitstream/handle/123456789/1156/Synopsis.pdf?sequence=1'
131158
},
132159
{
133-
'Journal' => 'gyanvihar.org',
134-
'Title' => 'Use of High Resolution Remote Sensing Data and GIS Techniques for Monitoring Of \'U\'Shaped Wetland At GB Nagar District,
135-
Uttar Pradesh',
136-
'Abstract' => "In developing countries of the world, the ever increasing population and to fulfill its need for housing and other economic activities almost urban fringe are getting encroached and our surrounding environment and natural wetlands, water bodies and other biological cycles are\x{a0}\x{2026}"
160+
'Title' => 'Use of High Resolution Remote Sensing Data and GIS Techniques for Monitoring Of \'U\'Shaped Wetland At GB Nagar District, Uttar Pradesh',
161+
'Journal' => 'gyanvihar.org',
162+
'Abstract' =>
163+
"In developing countries of the world, the ever increasing population and to fulfill its need for housing and other economic activities almost urban fringe are getting encroached and our surrounding environment and natural wetlands, water bodies and other biological cycles are\x{a0}\x{2026}",
164+
'Link' =>
165+
'https://www.gyanvihar.org/researchjournals/c3w/Chapter-2%20Use%20Of%20High%20Resolution%20Remote%20Sensing%20Data%20And%20GIS%20Techniques%20For%20Monitoring%20Of%20_U_%20Shaped%20Wetland%20At%20G.B.%20Nagar%20District,%20Uttar%20Pradesh.pdf'
137166
}
138-
]
167+
];
168+

0 commit comments

Comments
 (0)