# _*_coding:utf-8_*_
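"""
Collect Toutiao (今日头条) author profiles with EasySpider.

Rough pipeline, as inferred from the code below: read already-collected
author names from raw_data.csv, build Toutiao user-search URLs for the
entries in author_list.csv that are still missing, invoke an EasySpider
task via service_invoke.invokeService, parse the per-author stats from the
task's CSV output, append the new rows to raw_data.csv, and mirror the
accumulated data into all_data.xls.
"""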
import calendar
import csv
import json      # only used by the commented-out HTTP invocation in main()
import os
import re
import time
from datetime import datetime, timedelta

import requests  # only used by the commented-out HTTP invocation in main()
import xlwt

from commandline_config import Config
from service_invoke import invokeService


class TimeUtil(object):
    @classmethod
    def parse_timezone(cls, timezone):
        """
        Parse a timezone offset expression.
        :param timezone: str, e.g. "+8"
        :return: dict{symbol, offset}
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        symbol = result.groupdict()['symbol']
        offset = int(result.groupdict()['offset'])

        return {
            'symbol': symbol,
            'offset': offset
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """Shift dt, which is assumed to be UTC, by the given timezone offset."""
        result = cls.parse_timezone(timezone)
        symbol = result['symbol']
        offset = result['offset']

        if symbol == '+':
            return dt + timedelta(hours=offset)
        elif symbol == '-':
            return dt - timedelta(hours=offset)
        else:
            raise Exception('cannot parse timezone format')

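# Illustrative sanity check of the helpers above (not called by the pipeline;
# the expected values follow directly from the docstrings):
def _demo_timeutil():
    tz = TimeUtil.parse_timezone("+8")
    assert tz == {'symbol': '+', 'offset': 8}
    # A UTC afternoon shifted to UTC+8 lands in the evening of the same day:
    shifted = TimeUtil.convert_timezone(datetime(2022, 10, 17, 15, 0), "+8")
    assert shifted == datetime(2022, 10, 17, 23, 0)
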

def generate_timestamp():
    current_GMT = time.gmtime()
    # ts stores the current POSIX timestamp (computed but unused below)
    ts = calendar.timegm(current_GMT)

    current_time = datetime.utcnow()
    convert_now = TimeUtil.convert_timezone(current_time, '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)

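# generate_timestamp() returns the UTC+8 wall-clock time as a string such as
# "2022-10-17 23:35:40.881448"; main() strips the spaces and replaces ":" with
# "-" so the value can be embedded in a file name (see the commented-out
# example file_name below).
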
def main():
    # result = os.popen('python ServiceWrapper_ExecuteStage.py 38')
    # res = result.read()
    # for line in res.splitlines():
    #     print("\n\n\n\nfilename:\n\n\n\n\n", line)
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
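    # Config (from the commandline_config package) wraps the preset dict so
    # its fields are readable as attributes (c.test, c.test_pages below) and,
    # as that package is typically used, can be overridden via command-line
    # flags; the override behaviour is an assumption, not exercised here.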
    c = Config(config)
    print(c)
csv_reader = csv.reader(open("./raw_data.csv", encoding='utf-8'))
|
||
author_list = []
|
||
for line in csv_reader:
|
||
author_list.append(line[4])
|
||
|
||
csv_reader = csv.reader(open("./author_list.csv", encoding='utf-8'))
|
||
keywords = []
|
||
i = 0
|
||
for line in csv_reader:
|
||
if line[0] not in author_list:
|
||
keywords.append(line[0])
|
||
else:
|
||
print("Will not append keyword %s", line[0])
|
||
i += 1
|
||
if c.test and i > c.test_pages * 100:
|
||
break
|
||
# print("author_list:", author_list)
|
||
# exit(0)
|
||
|
||
urlList = ""
|
||
i = 0
|
||
|
||
for keyword in keywords:
|
||
url = "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=%s&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n" % keyword
|
||
# print(url)
|
||
urlList += url
|
||
i += 1
|
||
if c.test and i > c.test_pages:
|
||
break
|
||
print(urlList)
|
||
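    # urlList is now one search URL per line (CRLF-separated); presumably the
    # EasySpider service splits it on line breaks to seed the task's
    # urlList_0 parameter (an assumption based on the invocation below).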
    # exit(0)
    # result = requests.post(
    #     "http://servicewrapper.naibo.wang/backEnd/invokeService",
    #     data={"id": 7,  # serviceID
    #           "paras": json.dumps({"urlList_0": urlList,
    #                                }),
    #           })
    # descTaskID = int(result.text)
    descTaskID = invokeService(
        1, {"urlList_0": urlList})
    print("descTaskID: " + str(descTaskID))
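    # invokeService(service_id, paras) presumably wraps the POST shown in the
    # commented-out block above: it submits the service ID plus JSON-encoded
    # parameters and returns the created task's ID (an assumption inferred
    # from the requests.post call it replaced).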
    # exit(0)
    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)

    # Run the execute stage locally and wait for it to finish.
    command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
        str(descTaskID) + ' ' + filename
    result = os.system(command)
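    # ServiceWrapper_ExecuteStage_local.py is expected to write the scraped
    # rows to ./Data/task_<taskID>_<timestamp>.csv; that path is reconstructed
    # and read back below.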

    # authorTaskID = 53
    file_name = "task_" + str(descTaskID) + "_" + filename + ".csv"
    # file_name = "task_53_2022-10-1723-35-40.881448.csv"
    print("file_name:", file_name)
    csv_reader = csv.reader(
        open("./Data/"+file_name, encoding='utf-8'))  # taskID
    new_descTaskID = []
    i = 0
    for line in csv_reader:
        # print(line)
        if i > 0:  # skip the header row
            new_descTaskID.append(line)
        i += 1
    # print(new_author_list)
    # new_descTaskID = list(set([tuple(t) for t in new_descTaskID]))
    # new_descTaskID = list(set(new_descTaskID))  # deduplicate

    # Drop consecutive duplicates and split the "<likes>获赞<fans>粉丝<follow>关注"
    # stats string into separate columns.
    after_remove_duplicate = []
    for i in range(len(new_descTaskID)):
        try:
            if i > 0:
                if new_descTaskID[i][2] == new_descTaskID[i-1][2]:
                    continue
            if new_descTaskID[i][2] != "":
                zan = new_descTaskID[i][1].split("获赞")[0]
                fans = new_descTaskID[i][1].split("粉丝")[0].split("获赞")[1]
                follow = new_descTaskID[i][1].split("关注")[0].split("粉丝")[1]
                after_remove_duplicate.append(
                    [new_descTaskID[i][0], zan, fans, follow, new_descTaskID[i][2], new_descTaskID[i][3]])
        except Exception:
            # Malformed rows (missing columns or an unexpected stats format)
            # are simply skipped.
            pass
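    # Worked example of the split logic above, assuming a stats string s of
    # the form "100获赞200粉丝300关注" ("100 likes, 200 followers, 300 following"):
    #   s.split("获赞")[0]                   -> "100"  (likes)
    #   s.split("粉丝")[0].split("获赞")[1]  -> "200"  (followers)
    #   s.split("关注")[0].split("粉丝")[1]  -> "300"  (following)
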
print("after_remove_duplicate", after_remove_duplicate)
|
||
|
||
all_collected = []
|
||
for author in after_remove_duplicate:
|
||
all_collected.append(author[4])
|
||
print("all_collected:", all_collected)
|
||
|
||
for keyword in keywords:
|
||
if keyword not in all_collected:
|
||
print("keyword not collected:", keyword)
|
||
after_remove_duplicate.append(['', '', '', '', keyword, ''])
|
||
|
||
new_descTaskID = after_remove_duplicate
|
||
|
||
print("new_descTaskID:", new_descTaskID)
|
||
|
||
# for i in range(len(keywords)):
|
||
# author_list[i] = [keywords[i]].extend(new_descTaskID[i])
|
||
# for row in author_list:
|
||
# print(row)
|
||
|
||
with open("raw_data.csv", "a", encoding='utf-8', newline='') as csvfile:
|
||
writer = csv.writer(csvfile)
|
||
for row in new_descTaskID:
|
||
writer.writerow(row)
|
||
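    # raw_data.csv is opened in append mode ("a"), so each run adds only the
    # newly collected rows; previously collected authors are filtered out at
    # the top of main() via author_list.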

    # Re-read the accumulated CSV and mirror it into an Excel workbook.
    csv_reader = csv.reader(open("./raw_data.csv", encoding='utf-8'))
    all_data = []
    for line in csv_reader:
        all_data.append(line)

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Sheet")

    for i in range(len(all_data)):
        for j in range(len(all_data[i])):
            sheet.write(i, j, all_data[i][j])

    workbook.save("all_data.xls")
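    # Note: xlwt writes the legacy .xls format, which caps a sheet at 65,536
    # rows; an .xlsx writer would be needed if raw_data.csv grows beyond that.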


if __name__ == "__main__":
    main()