douyin_crawler.py 3.2 KB

  1. # -*- coding: utf-8 -*-
  2. # @Author : lihuiwen
  3. # @file : douyin_crawler
  4. # @Email : huiwennear@163.com
  5. # @Time : 2024/5/23 16:58
  6. """
  7. 抖音评论爬取
  8. """
  9. from utils.common_utils import CommonUtils
  10. import copy
  11. import json
  12. import requests
  13. from urllib.parse import urlparse, parse_qs
  14. class DyComment:
  15. def __init__(self):
  16. self.common_utils = CommonUtils()
  17. self.comment_list_headers = {
  18. 'sec-ch-ua':'"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
  19. 'Accept':'application/json, text/plain, */*',
  20. 'sec-ch-ua-mobile':'?0',
  21. 'User-Agent':self.common_utils.user_agent,
  22. 'sec-ch-ua-platform':'"Windows"',
  23. 'Sec-Fetch-Site':'same-origin',
  24. 'Sec-Fetch-Mode':'cors',
  25. 'Sec-Fetch-Dest':'empty',
  26. 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
  27. }
  28. def get_comment_list(self, req_url):
  29. if ('modal_id' in req_url):
  30. aweme_id = parse_qs(urlparse(req_url).query).get('modal_id')[0]
  31. else:
  32. aweme_id = urlparse(req_url).path.split("/")[-1]
  33. referer_url = f"https://www.douyin.com/discover?modal_id={aweme_id}"
  34. ms_token = self.common_utils.get_ms_token()
  35. ttwid_str, webid = self.common_utils.get_ttwid_webid(referer_url)
  36. comment_lsit_req_url = f"https://www.douyin.com/aweme/v1/web/comment/list/?device_platform=webapp&aid=6383&channel=channel_pc_web&aweme_id={aweme_id}&cursor=0&count=20&item_type=0&insert_ids=&whale_cut_token=&cut_version=1&rcFT=&update_version_code=170400&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=123.0.0.0&browser_online=true&engine_name=Blink&engine_version=123.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid={webid}&verifyFp=verify_lwg2oa43_Ga6DRjOO_v2cd_4NL7_AHTp_qMKyKlDdoqra&fp=verify_lwg2oa43_Ga6DRjOO_v2cd_4NL7_AHTp_qMKyKlDdoqra&msToken={ms_token}"
  37. comment_list_headers1 = copy.deepcopy(self.comment_list_headers)
  38. comment_list_headers1['Referer'] = referer_url
  39. comment_list_headers1['Cookie'] = f'ttwid={ttwid_str};'
  40. abogus = self.common_utils.get_abogus(comment_lsit_req_url, self.common_utils.user_agent)
  41. url = comment_lsit_req_url + "&a_bogus=" + abogus
  42. response = requests.request("GET", url, headers=comment_list_headers1,verify=False, timeout=3)
  43. if (response.text):
  44. req_json = response.json()
  45. total = req_json.get('total')
  46. comments = req_json.get('comments')
  47. if (comments):
  48. for comment_index in range(len(comments)):
  49. comment_item = comments[comment_index]
  50. print(f"爬取成功:{comment_item.get('user').get('nickname')}:{comment_item.get('text')}")
  51. else:
  52. print(f"爬取结束:评论数={total}")
  53. else:
  54. print(f"爬取失败或没有评论")
  55. if __name__ == '__main__':
  56. req_url = "https://www.douyin.com/discover?modal_id=7258913772092296485"
  57. dy_comment = DyComment()
  58. dy_comment.get_comment_list(req_url)