Files
2022-07-06 16:15:08 +08:00

153 lines
6.0 KiB
Python

# -*- coding: utf-8 -*-
import os
import json
import time
import traceback
import pymysql
import requests
# Default HTTP headers sent with every scraping request; the User-Agent
# imitates a desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    ),
}
def get_db_config():
    """Load the database connection settings from ``db_config.json``.

    The file is resolved relative to the current working directory.

    Returns:
        The parsed JSON content of the configuration file.

    Raises:
        FileNotFoundError: If ``db_config.json`` does not exist. This is an
            ``Exception`` subclass, so callers that catch ``Exception``
            keep working.
    """
    try:
        # Read bytes and let the json module detect the encoding, so the
        # result does not depend on the platform's default text encoding.
        with open("db_config.json", "rb") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError("请检查数据库配置文件db_config.json是否存在") from None
class dbutil:
    """Thin wrapper around a single pymysql connection.

    The connection is opened in ``__init__`` using the settings returned by
    ``get_db_config()`` and is exposed as ``self.db``.

    Note that ``insert`` closes the connection when it finishes, so an
    instance can serve at most one ``insert`` call. ``search`` leaves the
    connection open; call ``close()`` when done with it.
    """

    def __init__(self):
        config = get_db_config()
        self.db = pymysql.connect(host=config["host"],
                                  user=config["user"],
                                  password=config["password"],
                                  database=config["database"])

    def insert(self, sql, params=None):
        """Execute a write statement, commit it, then close the connection.

        Args:
            sql: The SQL statement. Use ``%s`` placeholders together with
                ``params`` instead of formatting values into the string.
            params: Optional sequence/mapping of values bound to the
                placeholders. ``None`` (the default) executes ``sql`` as-is.

        Errors raised while executing or committing are printed via
        ``traceback`` and the transaction is rolled back; they are not
        re-raised.
        """
        cursor = self.db.cursor()
        try:
            cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            # Best-effort write: report the failure and undo the transaction.
            traceback.print_exc()
            self.db.rollback()
        finally:
            # Always release the cursor and the connection, even when the
            # rollback itself fails.
            try:
                cursor.close()
            finally:
                self.db.close()

    def search(self, sql, params=None):
        """Execute a query and return all rows.

        Args:
            sql: The SQL query. Use ``%s`` placeholders together with
                ``params`` instead of formatting values into the string.
            params: Optional sequence/mapping of values bound to the
                placeholders. ``None`` (the default) executes ``sql`` as-is.

        Returns:
            The result of ``cursor.fetchall()``, or ``None`` if an error
            occurred (the error is printed via ``traceback``).
        """
        cursor = None
        try:
            cursor = self.db.cursor()
            cursor.execute(sql, params)
            return cursor.fetchall()
        except Exception:
            traceback.print_exc()
            return None
        finally:
            if cursor is not None:
                cursor.close()

    def close(self):
        """Close the underlying connection."""
        self.db.close()
def _fetch_comment_page(product_id, page):
    """Request one page of comments and return ``(url, parsed_json)``."""
    url = "https://you.163.com/xhr/comment/listByItemByTag.json?__timestamp={}&itemId={}&size=20&page={}&orderBy=0".format(
        int(time.time() * 1000), product_id, page)
    # A timeout keeps a stalled connection from hanging the scraper forever.
    payload = requests.get(url, headers=headers, timeout=10).json()
    return url, payload


def parse_comments(username, product_id, page_limit=20):
    """Scrape the comments of a product and store them in ``tb_comments``.

    Args:
        username: User name stored with every row.
        product_id: Item id of the product (sent as ``itemId`` in the request).
        page_limit: Maximum number of pages to fetch (default 20). Fetching
            many pages in one go tends to trigger anti-scraping measures.

    Each comment is inserted and committed on its own; a failing row is
    reported via ``traceback`` and rolled back without stopping the others.
    Network/JSON errors from the HTTP requests propagate to the caller.
    """
    first_url, first_page = _fetch_comment_page(product_id, 1)
    if not first_page:
        return

    # "data" / "pagination" may be present but null, so do not rely on the
    # .get() default alone.
    pagination = (first_page.get("data") or {}).get("pagination") or {}
    total_pages = pagination.get("totalPage")
    if total_pages is None:
        total_pages = 1
    pages = min(total_pages, page_limit)

    # Values are bound by the driver, so quotes in comment text cannot break
    # (or inject into) the statement.
    insert_sql = ("insert into tb_comments(username,url,product_id,comment,update_time) "
                  "values (%s,%s,%s,%s,%s);")

    # One connection for the whole run instead of one per comment.
    conn = dbutil()
    try:
        with conn.db.cursor() as cursor:
            for page in range(1, pages + 1):
                if page == 1:
                    # Reuse the response already fetched to read the page count.
                    comments_url, res = first_url, first_page
                else:
                    comments_url, res = _fetch_comment_page(product_id, page)
                comment_list = ((res or {}).get("data") or {}).get("commentList") or []
                for item in comment_list:
                    try:
                        row = (username,
                               comments_url,
                               str(product_id),
                               item.get("content", ""),
                               item.get("createTime", -1))
                        cursor.execute(insert_sql, row)
                        conn.db.commit()
                    except Exception:
                        traceback.print_exc()
                        conn.db.rollback()
    finally:
        conn.db.close()
def parse_tags(username, product_id):
    """Scrape the comment tags of a product and store them in ``tb_tags``.

    Args:
        username: User name stored with every row.
        product_id: Item id of the product (sent as ``itemId`` in the request).

    Each tag is inserted and committed on its own; a failing row is printed
    together with its traceback and rolled back without stopping the others.
    Network/JSON errors from the HTTP request propagate to the caller.
    """
    tags_url = "https://you.163.com/xhr/comment/tags.json?__timestamp={}&itemId={}".format(
        int(time.time() * 1000), product_id)
    # A timeout keeps a stalled connection from hanging the scraper forever.
    res = requests.get(tags_url, headers=headers, timeout=10).json()
    # "data" may be missing or null; treat both as "no tags".
    tag_items = (res or {}).get("data") or []
    if not tag_items:
        return

    # Values are bound by the driver instead of being formatted into the SQL
    # text, so tag names and counts cannot break the statement.
    insert_sql = ("insert into tb_tags(username,url,product_id,tag,count,update_time) "
                  "values (%s,%s,%s,%s,%s,%s);")

    # One connection for the whole run instead of one per tag.
    conn = dbutil()
    try:
        with conn.db.cursor() as cursor:
            for item in tag_items:
                row = None
                try:
                    row = (username,
                           tags_url,
                           str(product_id),
                           item.get("name", ""),
                           item.get("strCount", 0),
                           int(time.time()))
                    cursor.execute(insert_sql, row)
                    conn.db.commit()
                except Exception:
                    # Show the statement and values that failed.
                    print(insert_sql, row)
                    traceback.print_exc()
                    conn.db.rollback()
    finally:
        conn.db.close()
def _select_rows(sql, params):
    """Run a parameterized SELECT on a fresh connection and return all rows.

    Query errors are printed via ``traceback`` and yield an empty tuple.
    """
    conn = dbutil()
    try:
        with conn.db.cursor() as cursor:
            cursor.execute(sql, params)
            return cursor.fetchall()
    except Exception:
        traceback.print_exc()
        return ()
    finally:
        conn.db.close()


def get_data(username, product_id):
    """Return the stored comments and tags of a product, scraping if needed.

    If ``tb_comments`` has no rows for this user/product, the comments and
    tags are scraped first and the table is queried again.

    :param username: user name the rows were stored under
    :param product_id: item id of the product
    :return: ``{"tags": {tag: count, ...}, "comments": "<all comments joined>"}``
    """
    # Select columns by name (the same names the INSERT statements in this
    # file use) rather than relying on the physical column order of the tables.
    comments_sql = "select comment from tb_comments where product_id=%s and username=%s"
    tags_sql = "select tag, `count` from tb_tags where product_id=%s and username=%s"
    params = (str(product_id), username)

    comment_rows = _select_rows(comments_sql, params)
    if not comment_rows:
        # Nothing stored yet: scrape now, then read back what was saved.
        parse_comments(username, product_id)
        parse_tags(username, product_id)
        comment_rows = _select_rows(comments_sql, params)

    res = {
        "tags": {},
        "comments": None
    }
    # All comments of the product, concatenated into one string.
    res["comments"] = "".join(row[0] or "" for row in comment_rows)
    # Comment tags of the product and how often each one occurs.
    for tag, count in _select_rows(tags_sql, params):
        res["tags"][tag] = count
    return res
if __name__ == '__main__':
    # Ad-hoc manual run against a sample product; parse_tags("lzh", "1085007")
    # can be invoked the same way to scrape only the tags.
    sample_result = get_data("lzh", "1085007")
    print(sample_result)