-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathscript_processor.py
More file actions
125 lines (113 loc) · 4.37 KB
/
script_processor.py
File metadata and controls
125 lines (113 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import requests
from bs4 import BeautifulSoup
import omdb
import json
files = os.listdir('NLP-Movie_Scripts/scripts/')
#Create a a dict where each item contains information on a movie in dictionary format
movies = {}
for each in files:
movie = {}
if '-The_script.txt' in each:
movies['The '+each.replace('-The_script.txt','').replace('-', ' ')[:-1]] = {'filename':each}
else:
movies[each.replace('_script.txt','').replace('-', ' ')] = {'filename':each}
#fetch imdb data of a movie using OMDB api, please read OMDB doc for further details
def getOmdbData(movie):
api_key = 'eb6547ff'
omdb.set_default('apikey', api_key)
return omdb.get(title=movie, fullplot=True, tomatoes=True)
def prepURL(imdb_id):
#prepare the url from which we will scrape the cast data
url = 'https://www.imdb.com/title/'+imdb_id #+'/fullcredits?ref_=tt_cl_sm#cast'
return url
def getCharacters(url):
#Only first imdb page of the cast members are considered.
website_url = requests.get(url).text
soup = BeautifulSoup(website_url, "lxml")
#upon inspecting the html code, we see that the cast is structured in <table> tags
table = soup.find('table',{'class':'cast_list'})
#<tr> is a tag for table row [tr]
cast = table.find_all('tr')
characters = []
for each in cast:
temp = each.find_all('td')[-1].text.strip().replace('\n','').replace(' ','')
if 'uncredited' in temp:
characters.append(temp.split('(')[0][:-1])
else:
characters.append(temp)
characters = list(set(characters))
return characters
for movie in movies.keys():
#for the given movie name we fetch the imdb data
x = getOmdbData(movie)
try:
movies[movie]['imdb_id'] = x['imdb_id']
except:
continue
#We need to delete the movie's on which there isn't much data available
delete = []
for movie in movies.keys():
try:
url = prepURL(movies[movie]['imdb_id'])
characters = getCharacters(url)
movies[movie]['imdb_url'] = url
movies[movie]['characters'] = characters
except:
delete.append(movie)
#Delete the movies on which we don't have necessary data on
for movie in delete:
del movies[movie]
#Clean the data further
for movie in movies.keys():
if 'Cast overview, first billed only:' in movies[movie]['characters']:
movies[movie]['characters'].remove('Cast overview, first billed only:')
#Alternate Approach which works equally well.
"""
script = open('NLP-Movie_Scripts/scripts/'+file,'r')
for line in script:
temp = line.strip()
if 'thor'.upper() in temp:
if (len(temp.split(' ')) <= 2) & (temp.split(' ')[0]=='thor'.upper()):
print(temp)
"""
for movie in movies.keys():
script = open('NLP-Movie_Scripts/scripts/'+movies[movie]['filename'],'r').read()
#create a string containing character names seperated by commas
characters = ','.join([','.join(name.split(' ')) for name in movies[movie]['characters'] if len(name.split(' ')) <3])
# data structure to store the dialogues
dialogues = {}
for ch in characters.split(','):
if 'Mr.' in ch or '-' in ch:
continue
else:
name = ch
dialogue = ''
flag = 0
count = 0
for line in script.split("\n"):
temp = line.strip(" ").replace(line[line.find("("):line.find(")") + 1], "").strip(" ")
if temp.isupper(): # If Character Intro
if name.upper() in temp:
flag = 1
count += 1
dialogue += '\n['+str(count)+']'
continue
else: # If it's another character
flag = 0
continue
if flag:
if flag == 1:
ident_level_first = len(line) - len(line.lstrip(" "))
ident_level = len(line) - len(line.lstrip(" "))
if ident_level == ident_level_first:
dialogue += temp + " "
else:
continue
flag += 1
dialogues[name] = dialogue
movies[movie]['dialogues'] = dialogues
#information available on each movie ---- dict_keys(['filename', 'imdb_id', 'imdb_url', 'characters', 'dialogues'])
for movie in movies.keys():
with open(movie.replace(' ','_')+'.json', 'w') as fp:
json.dump(movies[movie], fp)