153 lines
6.0 KiB
Python
153 lines
6.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
import os
|
|
import json
|
|
import time
|
|
import traceback
|
|
|
|
import pymysql
|
|
import requests
|
|
|
|
# Default HTTP headers sent with every request to you.163.com; the
# browser-like User-Agent is used by all scraping calls in this module.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"
    ),
}
|
|
|
|
|
|
def get_db_config(path="db_config.json"):
    """Load the database connection settings from a JSON file.

    Args:
        path: Location of the JSON config file. Defaults to
            ``db_config.json`` in the current working directory, which is
            what the original hard-coded.

    Returns:
        The decoded JSON document (the rest of this module reads the keys
        ``host``, ``user``, ``password`` and ``database`` from it).

    Raises:
        FileNotFoundError: If the config file does not exist. This is a
            subclass of ``Exception``, so callers that caught the generic
            ``Exception`` raised previously keep working.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    try:
        # Explicit UTF-8: the platform default encoding is locale-dependent
        # and would mis-decode non-ASCII values on some systems.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as err:
        # Message text: "please check that the database config file
        # db_config.json exists".
        raise FileNotFoundError("请检查数据库配置文件db_config.json是否存在") from err
|
|
|
|
|
|
class dbutil:
    """Small helper that owns one pymysql connection.

    The connection is opened in ``__init__`` from the settings returned by
    ``get_db_config()`` and exposed as ``self.db``.

    ``insert`` closes the connection when it finishes, so an instance can
    serve at most one ``insert`` call. ``search`` leaves the connection
    open; the caller is responsible for closing ``self.db`` afterwards.
    """

    def __init__(self):
        """Open the connection described by the JSON config file."""
        config = get_db_config()
        self.db = pymysql.connect(host=config["host"],
                                  user=config["user"],
                                  password=config["password"],
                                  database=config["database"])

    def insert(self, sql, params=None):
        """Execute a write statement, commit it, then close the connection.

        Args:
            sql: The SQL statement. May contain ``%s`` placeholders when
                ``params`` is given.
            params: Optional sequence/dict of values bound to the
                placeholders by the driver (preferred over building the
                SQL with string formatting). ``None`` executes ``sql``
                verbatim, exactly like the original one-argument call.

        Returns:
            None. Errors from ``execute``/``commit`` are printed via
            ``traceback`` and the transaction is rolled back; they are not
            re-raised (best-effort insert).
        """
        try:
            cursor = self.db.cursor()
            try:
                cursor.execute(sql, params)
                self.db.commit()
            except Exception:
                # Roll back on any failure so a half-applied statement is
                # not left pending on the connection.
                traceback.print_exc()
                self.db.rollback()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even if rollback itself fails.
            self.db.close()

    def search(self, sql, params=None):
        """Execute a query and return all rows.

        Args:
            sql: The SQL query. May contain ``%s`` placeholders when
                ``params`` is given.
            params: Optional values bound to the placeholders; ``None``
                executes ``sql`` verbatim.

        Returns:
            Whatever ``cursor.fetchall()`` returns, or ``None`` if the
            query raised (the error is printed, not re-raised).
        """
        try:
            cursor = self.db.cursor()
            try:
                cursor.execute(sql, params)
                return cursor.fetchall()
            finally:
                cursor.close()
        except Exception:
            traceback.print_exc()
            return None
|
|
|
|
|
|
def _fetch_comment_page(product_id, page):
    """Download one page of comments; return ``(url, decoded_json)``."""
    url = "https://you.163.com/xhr/comment/listByItemByTag.json?__timestamp={}&itemId={}&size=20&page={}&orderBy=0".format(
        int(time.time() * 1000), product_id, page)
    # A timeout keeps a stalled connection from hanging the crawl forever.
    payload = requests.get(url, headers=headers, timeout=10).json()
    return url, payload or {}


def parse_comments(username, product_id, page_limit=20):
    """Crawl the comments of one product and store them in ``tb_comments``.

    Args:
        username: User name stored alongside every comment row.
        product_id: Item id, sent to the site as the ``itemId`` parameter.
        page_limit: Maximum number of pages to crawl (20 comments per
            page). Kept small by default because crawling many pages at
            once is likely to trigger the site's anti-scraping measures.

    Returns:
        None. Each comment is inserted with its own ``dbutil().insert``
        call; a failure on one comment is printed and does not stop the
        remaining comments.
    """
    first_url, first_page = _fetch_comment_page(product_id, 1)
    if not first_page:
        return

    pagination = (first_page.get("data") or {}).get("pagination") or {}
    pages = min(pagination.get("totalPage") or 1, page_limit)

    # Scraped text can contain quotes; escape every string literal so it
    # can neither break nor alter the INSERT statement.
    escape = pymysql.converters.escape_string

    for page in range(1, pages + 1):
        if page == 1:
            # Reuse the response already fetched to read the pagination.
            comments_url, res = first_url, first_page
        else:
            comments_url, res = _fetch_comment_page(product_id, page)

        for item in (res.get("data") or {}).get("commentList") or []:
            try:
                comment = item.get("content", "")
                # Inserted unquoted, so force it to be a plain integer.
                update_time = int(item.get("createTime", -1))
                sql = "insert into tb_comments(username,url,product_id,comment,update_time) values ('{}','{}','{}','{}',{});".format(
                    escape(str(username)),
                    escape(comments_url),
                    escape(str(product_id)),
                    escape(str(comment)),
                    update_time)
                dbutil().insert(sql)
            except Exception:
                # print_exc() takes no exception argument; its first
                # positional parameter is ``limit``.
                traceback.print_exc()
|
|
|
|
|
|
def parse_tags(username, product_id):
    """Crawl the comment tags of one product and store them in ``tb_tags``.

    Args:
        username: User name stored alongside every tag row.
        product_id: Item id, sent to the site as the ``itemId`` parameter.

    Returns:
        None. Each tag (``name``) and its count (``strCount``) is inserted
        with its own ``dbutil().insert`` call; a failure on one tag is
        printed and does not stop the remaining tags.
    """
    tags_url = "https://you.163.com/xhr/comment/tags.json?__timestamp={}&itemId={}".format(
        int(time.time() * 1000), product_id)
    # A timeout keeps a stalled connection from hanging the crawl forever.
    res = requests.get(tags_url, headers=headers, timeout=10).json() or {}

    # Scraped text can contain quotes; escape every string literal so it
    # can neither break nor alter the INSERT statement.
    escape = pymysql.converters.escape_string

    for item in res.get("data") or []:
        sql = None  # reset per item so the handler never prints a stale statement
        try:
            tag = item.get("name", "")
            # Inserted unquoted, so force it to be a plain integer; a
            # non-numeric value is reported by the handler below.
            count = int(item.get("strCount", 0))
            sql = "insert into tb_tags(username,url,product_id,tag,count,update_time) values ('{}','{}','{}','{}',{},{});".format(
                escape(str(username)),
                escape(tags_url),
                escape(str(product_id)),
                escape(str(tag)),
                count,
                int(time.time()))
            dbutil().insert(sql)
        except Exception:
            if sql is not None:
                print(sql)
            # print_exc() takes no exception argument; its first
            # positional parameter is ``limit``.
            traceback.print_exc()
|
|
|
|
|
|
def _query_rows(sql):
    """Run one SELECT on a fresh connection and always release it.

    Returns an empty tuple when the query fails or matches nothing.
    """
    conn = dbutil()
    try:
        return conn.search(sql) or ()
    finally:
        # search() does not close the connection, so do it here.
        conn.db.close()


def get_data(username, product_id):
    """Return the stored comments and tags of a product, crawling if needed.

    If ``tb_comments`` has no rows for this ``username``/``product_id``
    pair, the comments and tags are crawled first and then read back.

    Args:
        username: User name the rows were stored under.
        product_id: Item id of the product.

    Returns:
        A dict with two keys:
            ``"comments"``: all stored comment texts concatenated into one
                string (empty string if there are none).
            ``"tags"``: mapping of tag name to its stored count.
    """
    # Escape the lookup values so quotes in them cannot break the query.
    escape = pymysql.converters.escape_string
    where = "where product_id='{}' and username='{}'".format(
        escape(str(product_id)), escape(str(username)))
    # Select columns by name instead of indexing into ``select *``, which
    # silently depends on the table's column order.
    comments_sql = "select `comment` from tb_comments " + where
    tags_sql = "select `tag`, `count` from tb_tags " + where

    rows = _query_rows(comments_sql)
    if not rows:
        # Nothing stored yet: crawl now, then read back what was saved.
        # Each query uses a fresh connection so it sees the new rows.
        parse_comments(username, product_id)
        parse_tags(username, product_id)
        rows = _query_rows(comments_sql)

    res = {
        "tags": {},
        "comments": "".join(row[0] or "" for row in rows),
    }
    for _tag, count in _query_rows(tags_sql):
        res["tags"][_tag] = count
    return res
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: load (or crawl, if not stored yet) the data for a
    # sample user/product pair and print the resulting dict.
    sample_result = get_data("lzh", "1085007")
    print(sample_result)
|