js_reverse/gooood/extrator/clean_detail.py
luzhisheng bde9b2fe55 gooood
2023-05-11 10:51:13 +08:00

106 lines
3.8 KiB
Python

from base import BaseExtractor
import json
import re
import scrapy
from dao.file_dao import File
from lxml import etree
class CleanDetail(BaseExtractor):
    """Turn raw detail-page HTML into structured project records.

    ``clean`` reads the ``raw_gd_detail`` table through ``File``, extracts
    the headline fields and project metadata from each page, parses the
    article body embedded in the page's ``window.__INITIAL_STATE__`` JSON,
    and writes every record to the ``结果表`` JSON output.
    """

    # Captures the JSON assigned to window.__INITIAL_STATE__ up to the
    # closing script tag. The dots are escaped so they match only literal
    # dots, and the pattern is compiled once rather than once per page.
    _INITIAL_STATE_RE = re.compile(
        r'window\.__INITIAL_STATE__=(.*?)</script>')

    # Metadata rows are located by the label text inside a <span>; the link
    # texts live in the first <div> following the span's parent element.
    _META_XPATH = (
        '//span[contains(text(), "{label}")]'
        '/../following-sibling::div[1]/a/text()'
    )

    def __init__(self):
        # NOTE(review): BaseExtractor.__init__ is not called here, exactly
        # as before this rewrite - confirm the base class needs no set-up.
        self.file = File()

    @staticmethod
    def _first(values, default=''):
        """Return the first element of *values*, or *default* if empty."""
        return values[0] if values else default

    def _meta_values(self, selector, label):
        """Return every link text found in the metadata row named *label*."""
        return selector.xpath(self._META_XPATH.format(label=label))

    def _extract_content_html(self, response_text):
        """Return the article HTML stored in the embedded page state.

        Returns ``''`` when the page has no ``window.__INITIAL_STATE__``
        script, or when the state has no ``post`` / ``content`` entry.
        Malformed JSON is not swallowed: ``json.loads`` raises as usual.
        """
        found = self._INITIAL_STATE_RE.search(response_text)
        if found is None:
            return ''
        state = json.loads(found.group(1))
        post = state.get('post') or {}
        return post.get('content') or ''

    def _parse_content(self, content_html):
        """Split the article HTML into ``(contents, images)`` lists.

        Each ``<p>`` node is dispatched to ``parse_image`` or
        ``parse_paragraph``. Those helpers are not defined in this class
        (presumably inherited from ``BaseExtractor`` - verify); they are
        handed the lists to fill, and ``parse_image`` returns the updated
        image counter.
        """
        contents, images = [], []
        if not content_html:
            # Nothing to parse; also avoids handing None to scrapy.Selector.
            return contents, images

        image_count = 1
        body = scrapy.Selector(text=content_html, type='html')
        for node in body.xpath('//p').extract():
            sel = scrapy.Selector(text=node, type='html')
            if sel.css('img ::attr(class)'):
                image_count = self.parse_image(
                    selector=sel, src='img', resource=images,
                    count=image_count, content=contents,
                    src_attr='@data-src')
            elif sel.css('p') and '{margin:' not in node:
                # Paragraphs containing '{margin:' are skipped here.
                self.parse_paragraph(sel, contents)
            elif sel.css('center ::text'):
                self.parse_paragraph(sel, contents, tag='center')
        return contents, images

    def _build_record(self, response_text):
        """Build the output dict for a single raw page.

        Any headline or metadata field that is absent from the page becomes
        ``''`` (``[]`` for ``label``) instead of raising ``IndexError``, so
        one incomplete page cannot abort the whole batch.
        """
        page = etree.HTML(response_text)
        first = self._first

        contents, images = self._parse_content(
            self._extract_content_html(response_text))

        return {
            "title": first(page.xpath('//h1/text()')),
            "subtitle": first(page.xpath(
                '//div[contains(@class,"single-content")]/h2/text()')),
            "url": first(page.xpath('//meta[@property="og:url"]/@content')),
            "project_1": {
                'design_company': first(self._meta_values(page, "设计公司")),
                'location': first(self._meta_values(page, "位置")),
                'type_c': first(self._meta_values(page, "类型")),
                'material': first(self._meta_values(page, "材料")),
                # Labels keep every match; the other rows keep the first.
                'label': self._meta_values(page, "标签") or [],
                'classification': first(self._meta_values(page, "分类")),
            },
            "contents": contents,
            "images": images,
        }

    def clean(self):
        """Read all raw pages, convert each to a record, write the result."""
        df = self.file.read('raw_gd_detail')
        records = [
            self._build_record(response_text)
            for response_text in df["response_text"].values
        ]
        self.file.write_json('结果表', records)

    def run(self):
        """Entry point: run the cleaning pass."""
        self.clean()
# Script entry point: build the extractor and run one cleaning pass.
if __name__ == '__main__':
    CleanDetail().run()