# Mirror of https://github.com/luzhisheng/js_reverse.git
# (synced 2025-04-20 10:25:01 +08:00; 106 lines, 3.8 KiB, Python)
from base import BaseExtractor
|
|
import json
|
|
import re
|
|
import scrapy
|
|
from dao.file_dao import File
|
|
from lxml import etree
|
|
|
|
|
|
class CleanDetail(BaseExtractor):
    """Turn stored detail-page HTML into structured project records.

    Every ``response_text`` value of the ``raw_gd_detail`` table is parsed
    into one record (title, subtitle, URL, project attributes, body
    contents and images); the list of records is then written to the
    ``结果表`` JSON output through ``File``.

    A page that lacks an expected element yields an empty value for that
    field instead of raising, so a single malformed page does not abort
    the whole batch.
    """

    # Attribute rows: a <span> containing the label, whose parent's next
    # sibling <div> holds the value as link texts.
    _ATTR_XPATH = (
        '//span[contains(text(), "{}")]/../following-sibling::div[1]/a/text()'
    )

    # Embedded page state; compiled once instead of on every page, with the
    # dot after ``window`` escaped so it only matches a literal dot.
    _STATE_RE = re.compile(r'window\.__INITIAL_STATE__=(.*?)</script>')

    def __init__(self):
        # NOTE(review): the base-class initialiser is not invoked (same as
        # before); confirm BaseExtractor needs no initialisation.
        self.file = File()

    @staticmethod
    def _first(values, default=''):
        """Return the first item of *values*, or *default* when it is empty."""
        return values[0] if values else default

    @classmethod
    def _attr_values(cls, tree, label):
        """Return all link texts of the attribute row labelled *label*."""
        return tree.xpath(cls._ATTR_XPATH.format(label))

    def _parse_project(self, tree):
        """Collect the project attribute rows of a parsed page into a dict."""
        return {
            'design_company': self._first(self._attr_values(tree, "设计公司")),
            'location': self._first(self._attr_values(tree, "位置")),
            'type_c': self._first(self._attr_values(tree, "类型")),
            'material': self._first(self._attr_values(tree, "材料")),
            # Labels are multi-valued: keep every match, not only the first.
            'label': self._attr_values(tree, "标签") or [],
            'classification': self._first(self._attr_values(tree, "分类")),
        }

    def _parse_body(self, response_text):
        """Extract body contents and images from the embedded page state.

        Returns a ``(contents, images)`` pair of lists. Both are empty when
        the state script, its ``post`` entry or the post ``content`` is
        missing. Malformed JSON in the state script still raises.
        """
        contents, images = [], []

        match = self._STATE_RE.search(response_text)
        if match is None:
            return contents, images

        state = json.loads(match.group(1))
        content_html = (state.get('post') or {}).get('content')
        if not content_html:
            return contents, images

        body = scrapy.Selector(text=content_html, type='html')
        image_count = 1
        for paragraph in body.xpath('//p').extract():
            sel = scrapy.Selector(text=paragraph, type='html')
            # parse_image / parse_paragraph are not defined in this class;
            # presumably inherited from BaseExtractor. They are handed the
            # ``contents`` / ``images`` lists to fill.
            if sel.css('img ::attr(class)'):
                image_count = self.parse_image(
                    selector=sel, src='img', resource=images,
                    count=image_count, content=contents,
                    src_attr='@data-src')
            elif sel.css('p') and '{margin:' not in paragraph:
                # Paragraphs containing '{margin:' are not parsed as text.
                self.parse_paragraph(sel, contents)
            elif sel.css('center ::text'):
                self.parse_paragraph(sel, contents, tag='center')
        return contents, images

    def _parse_page(self, response_text):
        """Build the output record for one stored page."""
        tree = etree.HTML(response_text)
        contents, images = self._parse_body(response_text)
        return {
            "title": self._first(tree.xpath('//h1/text()')),
            "subtitle": self._first(tree.xpath(
                '//div[contains(@class,"single-content")]/h2/text()')),
            "url": self._first(
                tree.xpath('//meta[@property="og:url"]/@content')),
            "project_1": self._parse_project(tree),
            "contents": contents,
            "images": images,
        }

    def clean(self):
        """Parse every stored page and write all records to ``结果表``."""
        df = self.file.read('raw_gd_detail')
        records = [
            self._parse_page(response_text)
            for response_text in df["response_text"].values
        ]
        self.file.write_json('结果表', records)

    def run(self):
        """Entry point: run the cleaning pass."""
        self.clean()
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: build the cleaner and run the full cleaning pass.
    CleanDetail().run()
|