# -*- coding: utf-8 -*-
import csv
import os
import re
from datetime import datetime, timedelta

import xlwt

from commandline_config import Config
from service_invoke import invokeService


class TimeUtil(object):
    @classmethod
    def parse_timezone(cls, timezone):
        """
        Parse a timezone offset string.
        :param timezone: str, e.g. "+8"
        :return: dict{symbol, offset}
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        symbol = result.groupdict()['symbol']
        offset = int(result.groupdict()['offset'])
        return {
            'symbol': symbol,
            'offset': offset
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """Shift a naive datetime (assumed to be UTC) by the signed hour
        offset parsed from `timezone`, e.g. convert_timezone(dt, "+8")
        adds eight hours."""
        result = cls.parse_timezone(timezone)
        symbol = result['symbol']
        offset = result['offset']
        if symbol == '+':
            return dt + timedelta(hours=offset)
        elif symbol == '-':
            return dt - timedelta(hours=offset)
        else:
            raise ValueError('cannot parse timezone format: %s' % timezone)


def generate_timestamp():
    """Return the current UTC+8 time as a string, used to build file names."""
    current_time = datetime.utcnow()
    convert_now = TimeUtil.convert_timezone(current_time, '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)


def main():
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
    c = Config(config)
    print(c)

    # Authors already collected: raw_data.csv stores the author name in
    # column 4.
    with open("./raw_data.csv", encoding='utf-8') as f:
        author_list = [line[4] for line in csv.reader(f)]

    # Keywords to search for: every author in author_list.csv that has not
    # been collected yet.
    keywords = []
    with open("./author_list.csv", encoding='utf-8') as f:
        for i, line in enumerate(csv.reader(f)):
            if line[0] not in author_list:
                keywords.append(line[0])
            else:
                print("Will not append keyword %s" % line[0])
            if c.test and i + 1 > c.test_pages * 100:
                break

    # Build one Toutiao user-search URL per keyword, joined by CRLF, as the
    # input for the scraping service.
    urlList = ""
    for i, keyword in enumerate(keywords):
        url = "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=%s&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n" % keyword
        urlList += url
        if c.test and i + 1 > c.test_pages:
            break
    print(urlList)

    # Remote alternative (requires requests and json):
    # result = requests.post(
    #     "http://servicewrapper.naibo.wang/backEnd/invokeService",
    #     data={"id": 7,  # serviceID
    #           "paras": json.dumps({"urlList_0": urlList}),
    #           })
    # descTaskID = int(result.text)
    descTaskID = invokeService(1, {"urlList_0": urlList})
    print("descTaskID: " + str(descTaskID))

    # Run the local execute stage; it writes its results to
    # ./Data/task_<descTaskID>_<filename>.csv.
    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)
    command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
        str(descTaskID) + ' ' + filename
    os.system(command)

    file_name = "task_" + str(descTaskID) + "_" + filename + ".csv"
    print("file_name:", file_name)

    # Read the scraped result rows, skipping the header line.
    with open("./Data/" + file_name, encoding='utf-8') as f:
        new_descTaskID = [line for i, line in enumerate(csv.reader(f)) if i > 0]
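    # NOTE (inferred from the splits below, not confirmed by the source): the
    # stats field in column 1 is assumed to be a single string of the form
    # "<likes>获赞<followers>粉丝<following>关注" (likes / followers /
    # following), e.g. "1000获赞200粉丝10关注", from which the three counts
    # are sliced out.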
    # Deduplicate: consecutive rows with the same author name (column 2)
    # belong to the same profile, so keep only the first of each run.
    after_remove_duplicate = []
    for i in range(len(new_descTaskID)):
        try:
            if i > 0:
                if new_descTaskID[i][2] == new_descTaskID[i - 1][2]:
                    continue
            if new_descTaskID[i][2] != "":
                zan = new_descTaskID[i][1].split("获赞")[0]
                fans = new_descTaskID[i][1].split("粉丝")[0].split("获赞")[1]
                follow = new_descTaskID[i][1].split("关注")[0].split("粉丝")[1]
                after_remove_duplicate.append(
                    [new_descTaskID[i][0], zan, fans, follow,
                     new_descTaskID[i][2], new_descTaskID[i][3]])
        except IndexError:
            # Skip rows whose stats field does not match the expected pattern.
            pass
    print("after_remove_duplicate", after_remove_duplicate)

    # Keywords that actually produced a result (author name is at index 4).
    all_collected = [author[4] for author in after_remove_duplicate]
    print("all_collected:", all_collected)

    # Append an empty placeholder row for every keyword that returned nothing,
    # so it is treated as collected on the next run.
    for keyword in keywords:
        if keyword not in all_collected:
            print("keyword not collected:", keyword)
            after_remove_duplicate.append(['', '', '', '', keyword, ''])
    new_descTaskID = after_remove_duplicate
    print("new_descTaskID:", new_descTaskID)

    # Append the new rows to raw_data.csv...
    with open("raw_data.csv", "a", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in new_descTaskID:
            writer.writerow(row)

    # ...then export the full accumulated table to all_data.xls.
    with open("./raw_data.csv", encoding='utf-8') as f:
        all_data = [line for line in csv.reader(f)]
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Sheet")
    for i in range(len(all_data)):
        for j in range(len(all_data[i])):
            sheet.write(i, j, all_data[i][j])
    workbook.save("all_data.xls")


if __name__ == "__main__":
    main()
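
# Usage sketch (assumptions: the commandline_config and service_invoke modules
# and the ServiceWrapper_ExecuteStage_local.py stage runner are available next
# to this script, and raw_data.csv / author_list.csv already exist):
#
#   python this_script.py
#
# Each run searches only the authors not yet present in raw_data.csv, appends
# the newly scraped rows (plus empty placeholders for keywords that returned
# nothing) to raw_data.csv, and re-exports the whole table to all_data.xls.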