core.py

# -*- coding: utf-8 -*-

"""
Direct Messages Archiver

Usage:

>>> from dmarchiver.core import Crawler
>>> crawler = Crawler()
>>> crawler.authenticate('username', 'password', save_session=False, raw_output=False)
>>> crawler.crawl('conversation_id')
"""
import collections
import datetime
from enum import Enum
from json import dump as json_dump
import os
import pickle
import random
import re
import shutil
from sys import platform
import time
import traceback

import lxml.html
import requests
from ratelimit import limits

API_LIMIT = 900
API_RESET = 900
DEFAULT_BEARER_TOKEN = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'

__all__ = ['Crawler']


# Expand short URL generated by Twitter
def expand_url(url):
    """Return the expanded URL behind a short link"""
    response = requests.get(url, allow_redirects=False)
    return response.headers['location']
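
# Illustrative usage sketch (not part of the original module): resolving a
# shortened t.co link. The URLs below are hypothetical placeholders.
#
#     >>> expand_url('https://t.co/abc123')
#     'https://example.com/original-page'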


class Conversation(object):
    """This class is a representation of a complete conversation"""

    conversation_id = None
    tweets = collections.OrderedDict()

    def __init__(self, conversation_id):
        self.tweets = collections.OrderedDict()
        self.conversation_id = conversation_id

    def print_conversation(self):
        """Print the conversation in the console"""
        items = list(self.tweets.items())
        items.reverse()
        for tweet in items:
            if type(tweet[1]).__name__ == 'DirectMessage':
                irc_formatted_date = datetime.datetime.fromtimestamp(
                    int(tweet[1].time_stamp)).strftime('%Y-%m-%d %H:%M:%S')
                print(
                    '[{0}] <{1}> '.format(
                        irc_formatted_date,
                        tweet[1].author),
                    end='')
                for element in tweet[1].elements:
                    print('{0} '.format(element), end='')
                print('\r')
            elif type(tweet[1]).__name__ == 'DMConversationEntry':
                print('[DMConversationEntry] {0}\r'.format(tweet[1]))

    def write_conversation(self, filename, max_id):
        """Write the content of the conversation to a file"""
        file_buffer = ''
        items = list(self.tweets.items())
        items.reverse()
        for tweet in items:
            if type(tweet[1]).__name__ == 'DirectMessage':
                irc_formatted_date = datetime.datetime.fromtimestamp(
                    int(tweet[1].time_stamp)).strftime('%Y-%m-%d %H:%M:%S')
                file_buffer += '[{0}] <{1}> '.format(
                    irc_formatted_date, tweet[1].author)
                for element in tweet[1].elements:
                    # Convert all '\n' of the buffer to os.linesep
                    # to handle tweets on multiple lines
                    file_buffer += '{0} '.format(
                        element).replace('\n', os.linesep)
                # Remove the last space of the line
                file_buffer = file_buffer[:-1]
                # Add the end of line character
                file_buffer += '{0}'.format(os.linesep)
            elif type(tweet[1]).__name__ == 'DMConversationEntry':
                file_buffer += '[DMConversationEntry] {0}{1}'.format(
                    tweet[1], os.linesep)

        # Write the latest tweet ID to allow incremental updates
        if len(items) > 0:
            file_buffer += '[LatestTweetID] {0}{1}'.format(
                tweet[1].tweet_id, os.linesep)

        if max_id != '0':
            with open(filename, 'rb+') as file:
                lines = file.readlines()
                # Remove last line and rewrite the file (poor
                # performance...)
                lines = lines[:-1]
                file.seek(0)
                file.write(b''.join(lines))
                file.truncate()

        file_mode = "ab"
        if max_id == '0':
            file_mode = "wb"
        with open(filename, file_mode) as file:
            file.write(file_buffer.encode('UTF-8'))
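
# Illustrative example (not from the original file) of the on-disk format
# produced by write_conversation, with placeholder values: one line per
# message, followed by the incremental-update marker.
#
#   [2017-04-01 13:45:10] <alice> Hello there!
#   [DMConversationEntry] alice changed the group name to Friends
#   [LatestTweetID] 848000000000000000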


class DMConversationEntry(object):
    """This class is a representation of a DMConversationEntry.

    It represents events such as a new user joining the group,
    the group being renamed or the group picture being updated.
    """

    tweet_id = ''
    _text = ''

    def __init__(self, tweet_id, text):
        self.tweet_id = tweet_id
        self._text = text.strip()

    def __str__(self):
        return self._text


class DirectMessage(object):
    """This class is a representation of a Direct Message (a tweet)"""

    tweet_id = ''
    time_stamp = ''
    author = ''
    elements = []

    def __init__(self, tweet_id, time_stamp, author):
        self.tweet_id = tweet_id
        self.time_stamp = time_stamp
        self.author = author


class DirectMessageText(object):
    """This class is a representation of a simple text message.

    This is an "element" of the Direct Message.
    """

    _text = ''

    def __init__(self, text):
        self._text = text

    def __str__(self):
        return self._text


class DirectMessageTweet(object):
    """This class is a representation of a quoted tweet.

    This is an "element" of the Direct Message.
    """

    _tweet_url = ''

    def __init__(self, tweet_url):
        self._tweet_url = tweet_url

    def __str__(self):
        return '[Tweet] {0}'.format(self._tweet_url)


class MediaType(Enum):
    """This class is a representation of the possible media types."""
    image = 1
    gif = 2
    video = 3
    sticker = 4
    unknown = 5


class DirectMessageMedia(object):
    """This class is a representation of an embedded media.

    This is an "element" of the Direct Message.
    """

    _media_preview_url = ''
    _media_url = ''
    _media_alt = ''
    _media_type = ''
    _media_replace_url = ''

    def __init__(self, media_url, media_preview_url, media_type, media_replace_url):
        self._media_url = media_url
        self._media_preview_url = media_preview_url
        self._media_type = media_type
        self._media_replace_url = media_replace_url

    def __repr__(self):
        # Todo
        return "{0}('{1}','{2}','{3}')".format(
            self.__class__.__name__,
            self._media_url,
            self._media_preview_url,
            self._media_replace_url)

    def __str__(self):
        if self._media_preview_url != '':
            return '[Media-{0}] {1} [Media-preview] {2}'.format(
                self._media_type.name, self._media_url, self._media_preview_url)
        else:
            return '[Media-{0}] {1}'.format(
                self._media_type.name, self._media_url)


class Crawler(object):
    """This class is the main component of the tool.

    It creates an authenticated session, retrieves the conversation
    list and loops to gather all the tweets.
    """

    _twitter_base_url = 'https://twitter.com'
    _referer_url = 'https://twitter.com/messages/{}'
    _bearer_token_url = 'https://abs.twimg.com/responsive-web/client-web/main.05e1f885.js'
    _api_url = 'https://api.twitter.com'

    _user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
    if platform == 'darwin':
        _user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/603.1.13 (KHTML, like Gecko) Version/10.1 Safari/603.1.13'
    elif platform == 'linux' or platform == 'linux2':
        _user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'

    _http_headers = {
        'User-Agent': _user_agent}
    _login_headers = {
        'User-Agent': _user_agent,
        'Referer': 'https://mobile.twitter.com/login'}
    _ajax_headers = {
        'user-agent': _user_agent,
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'referer': 'https://mobile.twitter.com',
        'x-twitter-active-user': 'yes',
        'origin': 'https://twitter.com',
        'accept-language': 'en-US,en-GB;q=0.9,en;q=0.8'}
    _api_headers = {
        'User-Agent': _user_agent,
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en-GB;q=0.9,en;q=0.8',
        'Origin': 'https://twitter.com',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Site': 'same-site',
        'X-Twitter-Active-User': 'yes',
        'X-Twitter-Auth-Type': 'OAuth2Session',
        'X-Twitter-Client-Language': 'en'}

    _max_id_found = False
    _session = None

    def authenticate(self, username, password, save_session, raw_output, mfa_token=None):
        """Open an authenticated Twitter session, reusing a saved session
        file when possible, otherwise logging in with the credentials."""
        force_nojs = 'https://mobile.twitter.com/i/nojs_router?path=%2Flogin'
        login_url = 'https://mobile.twitter.com/login'
        mfa_url = 'https://mobile.twitter.com/account/login_verification'
        sessions_url = 'https://mobile.twitter.com/sessions'
        messages_url = self._twitter_base_url + '/messages'

        if save_session:
            try:
                with open('dmarchiver_session.dat', 'rb') as file:
                    self._session = pickle.load(file)
                    print('dmarchiver_session.dat found. Reusing a previous session, ignoring the provided credentials.')
                    # Test if the session is still valid
                    response = self._session.get(messages_url, headers=self._http_headers, allow_redirects=False)
                    if response.status_code == 200:
                        return
                    else:
                        self._session = None
                        print('Previous session is invalid. Creating a new session with the provided credentials.')
            except FileNotFoundError:
                print('dmarchiver_session.dat not found. Creating a new session with the provided credentials.')

        if save_session is False or self._session is None:
            self._session = requests.Session()

        if raw_output:
            raw_output_file = open(
                'authentication-{0}.txt'.format(username), 'wb')

        response = self._session.post(
            force_nojs,
            headers=self._login_headers)

        if raw_output:
            raw_output_file.write(response.content)
            raw_output_file.close()

        document = lxml.html.document_fromstring(response.content)
        authenticity_token = document.xpath(
            '//input[@name="authenticity_token"]/@value')[0]

        payload = {'session[username_or_email]': str(username),
                   'session[password]': password,
                   'authenticity_token': authenticity_token}

        response = self._session.post(
            sessions_url,
            headers=self._ajax_headers,
            params=payload)

        if mfa_token is not None and 'auth_token' not in dict(self._session.cookies):
            document = lxml.html.document_fromstring(response.content)
            challenge_id = document.xpath('//input[@name="challenge_id"]/@value')[0]
            user_id = document.xpath('//input[@name="user_id"]/@value')[0]
            payload = {
                'challenge_type': 'Totp',
                'user_id': user_id,
                'platform': 'web',
                'challenge_response': str(mfa_token),
                'challenge_id': challenge_id,
                'authenticity_token': authenticity_token}
            response = self._session.post(mfa_url, headers=self._ajax_headers, params=payload)

        cookies = requests.utils.dict_from_cookiejar(self._session.cookies)
        if 'auth_token' in cookies:
            print('Authentication succeeded.{0}'.format(os.linesep))

            if save_session:
                # Saving the session locally
                with open('dmarchiver_session.dat', "wb") as file:
                    pickle.dump(self._session, file)
        else:
            raise PermissionError(
                'Your username or password was invalid. Note: DMArchiver supports multi-factor authentication (provided at the command line), but not application passwords.')

    def _get_bearer_token(self):
        """Extract the public Bearer token from the Twitter web client script,
        falling back to the known default value."""
        try:
            response = self._session.get(self._bearer_token_url)
            return 'Bearer {}'.format(re.findall(r'(AAAAAA.*?)"', str(response.content))[0])
        except Exception:
            return 'Bearer {}'.format(DEFAULT_BEARER_TOKEN)

    def _cookie_string(self):
        """Build the Cookie header value and set the matching CSRF and
        Authorization headers used for the API requests."""
        cookies = dict(self._session.cookies)
        csrf_token = ''.join(random.choice('1234567890abcdef') for i in range(32))
        cookies['ct0'] = csrf_token
        self._api_headers['x-csrf-token'] = csrf_token
        self._api_headers['Authorization'] = self._get_bearer_token()
        return "; ".join([str(x) + "=" + str(y) for x, y in cookies.items()])

    def get_threads(self, delay, raw_output):
        """Return the list of conversation threads from the trusted inbox."""
        threads = []
        messages_url = self._twitter_base_url + '/messages'
        payload = {}
        first_request = False

        if raw_output:
            raw_output_file = open(
                'conversation-list.txt', 'wb')

        while True:
            response = self._session.get(
                messages_url,
                headers=self._ajax_headers,
                params=payload)

            if raw_output:
                raw_output_file.write(response.content)

            json = response.json()

            if 'errors' in json:
                print('An error occurred while parsing the conversations.\n')
                if json['errors'][0]['code'] == 326:
                    print('''DMArchiver was identified as suspicious and your account has been temporarily locked by Twitter.
Don\'t worry, you can unlock your account by following the instructions on the Twitter website.
Maybe it\'s the first time you use it or maybe you have a lot of messages.
You can unlock your account and try again, and possibly use the -d option to slow down the tool.\n''')
                print('''Twitter error details below:
Code {0}: {1}\n'''.format(json['errors'][0]['code'], json['errors'][0]['message']))
                raise Exception('Stopping execution due to parsing error while retrieving the conversations.')

            try:
                if first_request is False:
                    first_request = True
                    threads += json['inner']['trusted']['threads']
                    if json['inner']['trusted']['has_more'] is False:
                        break
                    payload = {'is_trusted': 'true', 'max_entry_id': json[
                        'inner']['trusted']['min_entry_id']}
                    messages_url = self._twitter_base_url + '/inbox/paginate?is_trusted=true&max_entry_id=' + \
                        json['inner']['trusted']['min_entry_id']
                else:
                    if json['trusted']['is_empty'] is True:
                        break
                    threads += json['trusted']['threads']
                    if json['trusted']['has_more'] is False:
                        break
                    payload = {'is_trusted': 'true',
                               'max_entry_id': json['trusted']['min_entry_id']}
                    messages_url = self._twitter_base_url + '/inbox/paginate?is_trusted=true&max_entry_id=' + \
                        json['trusted']['min_entry_id']
            except KeyError as ex:
                print(
                    'Unable to fully parse the list of the conversations.\n'
                    'Maybe your account is locked or Twitter has updated the HTML code.\n'
                    'Use -r to get the raw output and post an issue on GitHub.\n'
                    'Exception: {0}'.format(str(ex)))
                break

            time.sleep(delay)

        if raw_output:
            raw_output_file.close()

        return threads

    def _get_latest_tweet_id(self, thread_id):
        """Return the latest tweet ID found in a previous dump, or '0'."""
        filename = '{0}.txt'.format(thread_id)
        try:
            with open(filename, 'rb+') as file:
                lines = file.readlines()
                regex = r"^\[LatestTweetID\] ([0-9]+)"
                result = re.match(regex, lines[-1].decode('utf-8'))
                if result:
                    print('Latest tweet ID found in previous dump. Incremental update.')
                    return result.group(1)
                else:
                    print(
                        'Latest tweet ID not found in previous dump. Creating a new one with incremental support.')
        except IOError:
            print(
                "Previous conversation not found. Creating a new one with incremental support.")
        return '0'

    def _get_media_url(self, variants):
        """Return the URL of the highest-bitrate variant of a media."""
        return sorted(variants, key=lambda i: i['bitrate'] if 'bitrate' in i else -1, reverse=True)[0]['url']
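
    # Illustrative example of the variant selection above (hypothetical data):
    # given [{'bitrate': 320000, 'url': 'low.mp4'}, {'bitrate': 832000, 'url': 'high.mp4'}],
    # _get_media_url returns 'high.mp4'; variants without a 'bitrate' key sort last.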

    def _parse_dm_media(self, type, media, tweet_id, time_stamp, download):
        """Parse a media entry (photo, GIF or video) and optionally download it."""
        media_url = ''
        media_preview_url = ''
        media_alt = ''
        media_replace_url = ''
        media_type = MediaType.unknown
        # Initialized to avoid a NameError when the media type is unknown
        media_filename = None

        formatted_timestamp = datetime.datetime.fromtimestamp(
            int(time_stamp)).strftime('%Y%m%d-%H%M%S')

        self._session.headers.update({'Referer': 'https://twitter.com/?lang=en'})
        media_replace_url = media['expanded_url']

        if type == 'photo':
            media_url = media['media_url_https']
            media_filename_re = re.findall(r'/\d+/(.+)/(.+)$', media_url)
            media_sticker_filename_re = re.findall(
                '/stickers/stickers/(.+)$', media_url)
            if len(media_filename_re) > 0:
                media_type = MediaType.image
                media_filename = '{0}-{1}-{2}-{3}'.format(
                    formatted_timestamp, tweet_id, media_filename_re[0][0], media_filename_re[0][1])
            elif len(media_sticker_filename_re) > 0:
                # It is a sticker
                media_type = MediaType.sticker
                media_filename = 'sticker-' + media_sticker_filename_re[0]
            else:
                # Unknown media type
                print("Unknown media type")
            if media_filename is not None and download:
                response = self._session.get(media_url, headers=self._api_headers, stream=True)
                while response.status_code == 429:
                    time.sleep(60)
                    response = self._session.get(media_url, headers=self._api_headers, stream=True)
                if response.status_code == 200:
                    os.makedirs(
                        '{0}/images'.format(self._conversation_id), exist_ok=True)
                    with open('{0}/images/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
                        file.write(response.content)
        elif type == 'animated_gif':
            media_type = MediaType.gif
            media_preview_url = media['media_url_https']
            media_url = self._get_media_url(media['video_info']['variants'])
            media_filename_re = re.findall(r'dm_gif/(.+)/(.+)$', media_url)
            media_filename = '{0}-{1}-{2}'.format(formatted_timestamp, media_filename_re[0][
                0], media_filename_re[0][1])
            if download:
                response = self._session.get(media_url, stream=True)
                if response.status_code == 200:
                    os.makedirs(
                        '{0}/mp4-gifs'.format(self._conversation_id), exist_ok=True)
                    with open('{0}/mp4-gifs/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
                        file.write(response.content)
        elif type == 'video':
            media_type = MediaType.video
            media_preview_url = media['media_url_https']
            media_url = self._get_media_url(media['video_info']['variants'])
            media_filename = '{0}-{1}.mp4'.format(
                formatted_timestamp, tweet_id)
            if download:
                response = self._session.get(media_url, stream=True)
                if response.status_code == 200:
                    os.makedirs(
                        '{0}/mp4-videos'.format(self._conversation_id), exist_ok=True)
                    with open('{0}/mp4-videos/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
                        file.write(response.content)
        else:
            print('Unknown media')

        return DirectMessageMedia(media_url, media_preview_url, media_type, media_replace_url)

    def _process_tweets(self, tweets, users, download, max_id):
        """Parse a page of tweets and return them as a dictionary indexed by tweet ID."""
        conversation_set = {}

        for tweet_container in tweets:
            # Reset so that a break before parsing does not reuse a stale message
            message = None
            try:
                for type, t in tweet_container.items():
                    tweet_type = type
                    tweet_id = t['id']
                    tweet = t
                    if tweet_id == max_id:
                        self._max_id_found = True
                        print('Previous tweet limit found.')
                        break
                    time_stamp = tweet['time'][:10]
                    if tweet_type == 'conversation_name_update':
                        dm_author = tweet['by_user_id']
                        dm_author_name = users[dm_author]['screen_name']
                        text = '{} changed the group name to {}'.format(
                            dm_author_name,
                            tweet['conversation_name'])
                        dm_author_name = 'DMConversationEntry'
                    elif tweet_type == 'join_conversation' or tweet_type == 'participants_join':
                        dm_author = tweet['sender_id']
                        dm_author_name = users[dm_author]['screen_name']
                        joiners = [users[user['user_id']]['screen_name'] for user in tweet['participants']]
                        text = '{} added {}.'.format(dm_author_name, ', '.join(joiners))
                        dm_author_name = 'DMConversationEntry'
                    elif tweet_type == 'leave_conversation' or tweet_type == 'participants_leave':
                        leavers = [users[user['user_id']]['screen_name'] for user in tweet['participants']]
                        text = '{} left.'.format(', '.join(leavers))
                        dm_author_name = 'DMConversationEntry'
                    elif tweet_type == 'message':
                        dm_author = tweet['message_data']['sender_id']
                        dm_author_name = users[dm_author]['screen_name']
                        msg = tweet['message_data']
                        text = msg['text']
                        if 'entities' in msg and 'urls' in msg['entities']:
                            for url in msg['entities']['urls']:
                                text = text.replace(url['url'], url['expanded_url'])
                        if 'attachment' in msg:
                            for k, v in msg['attachment'].items():
                                if k == 'tweet':
                                    element = DirectMessageTweet(v['expanded_url'])
                                    text = text.replace(element._tweet_url, str(element))
                                else:
                                    element = self._parse_dm_media(k, v, tweet_id, time_stamp, download[k])
                                    text = text.replace(element._media_replace_url, str(element))
                    else:  # unknown type
                        raise Exception
                    message = DirectMessage(tweet_id, time_stamp, dm_author_name)
                    message.elements = [DirectMessageText(text)]
            except KeyboardInterrupt:
                print(
                    'Script execution interruption requested. Writing the conversation.')
                self._max_id_found = True
                break
            except Exception as ex:
                print(
                    'Unexpected error \'{0}\' for tweet \'{1}\', raw JSON will be used for the tweet.'.format(ex, tweet_id))
                traceback.print_exc()
                message = DMConversationEntry(
                    tweet_id, '[ParseError] Parsing of tweet \'{0}\' failed. Raw JSON: {1}'.format(
                        tweet_id, tweet))
            if message is not None:
                conversation_set[tweet_id] = message

        return conversation_set

    @limits(calls=API_LIMIT, period=API_RESET)
    def _api_call(self, url, headers, payload):
        return self._session.get(url, headers=headers, params=payload)
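
    # Note on the rate limiting above: with the ratelimit package, exceeding
    # API_LIMIT calls within an API_RESET-second window makes the decorator
    # raise ratelimit.RateLimitException instead of sleeping; wrapping the
    # method with ratelimit.sleep_and_retry would block until the window resets.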

    def crawl(
            self,
            conversation_id,
            delay=0,
            download_images=False,
            download_gifs=False,
            download_videos=False,
            raw_output=False):
        """Crawl a conversation and write it to '<conversation_id>.txt'."""
        raw_output_file = None
        if raw_output:
            # Text mode because json.dump() writes str, not bytes
            raw_output_file = open(
                '{0}-raw.txt'.format(conversation_id), 'w', encoding='utf-8')

        print('{0}Starting crawl of \'{1}\''.format(
            os.linesep, conversation_id))

        # Attempt to find the latest tweet ID of a previous crawl session
        max_id = self._get_latest_tweet_id(conversation_id)

        payload = {}
        self._conversation_id = conversation_id
        conversation = Conversation(conversation_id)
        conversation_url = '{}/1.1/dm/conversation/{}.json'.format(self._api_url, conversation_id)
        self._api_headers['referer'] = self._referer_url.format(conversation_id)
        self._api_headers['cookie'] = self._cookie_string()
        processed_tweet_counter = 0

        try:
            while not self._max_id_found:
                response = self._api_call(conversation_url, self._api_headers, payload)
                json = response.json()

                if 'conversation_timeline' not in json:
                    print('An error occurred while parsing the tweets.\n')
                    if json['errors'][0]['code'] == 326:
                        print('''DMArchiver was identified as suspicious and your account has been temporarily locked by Twitter.
Don\'t worry, you can unlock your account by following the instructions on the Twitter website.
Maybe it\'s the first time you use it or maybe you have a lot of messages.
You can unlock your account and try again, and possibly use the -d option to slow down the tool.\n''')
                    print('''Twitter error details below:
Code {0}: {1}\n'''.format(json['errors'][0]['code'], json['errors'][0]['message']))
                    raise Exception('Stopping execution due to parsing error while retrieving the tweets.')

                json = json['conversation_timeline']
                payload = {'max_id': json['min_entry_id']}
                tweets = json['entries']
                users = json['users']

                if raw_output:
                    json_dump(json, raw_output_file)

                # Get tweets for the current request
                conversation_set = self._process_tweets(
                    tweets, users,
                    {'photo': download_images, 'animated_gif': download_gifs, 'video': download_videos},
                    max_id)

                # Append to the whole conversation
                for tweet_id in conversation_set:
                    processed_tweet_counter += 1
                    conversation.tweets[tweet_id] = conversation_set[tweet_id]
                    print('Processed tweets: {0}\r'.format(
                        processed_tweet_counter), end='')

                if json['status'] == 'AT_END':
                    print('Beginning of thread reached')
                    break

                time.sleep(delay)
        except KeyboardInterrupt:
            print(
                'Script execution interruption requested. Writing this conversation.')

        if raw_output:
            raw_output_file.close()

        print('Total processed tweets: {0}'.format(processed_tweet_counter))

        # print('Printing conversation')
        # conversation.print_conversation()

        print('Writing conversation to {0}.txt'.format(
            os.path.join(os.getcwd(), conversation_id)))

        conversation.write_conversation(
            '{0}.txt'.format(conversation_id), max_id)

        self._max_id_found = False
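

# Illustrative usage sketch (not part of the original module). The credentials,
# conversation ID and flags below are placeholders; authenticate(), get_threads()
# and crawl() are called with their signatures from this file, and get_threads()
# is assumed here to return a list of conversation IDs.
#
# if __name__ == '__main__':
#     crawler = Crawler()
#     crawler.authenticate('username', 'password',
#                          save_session=False, raw_output=False)
#     for thread_id in crawler.get_threads(delay=1, raw_output=False):
#         crawler.crawl(thread_id, delay=1)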