Mirror of https://github.com/luzhisheng/js_reverse.git (synced 2025-04-12 11:37:09 +08:00)

Commit bde9b2fe55 "gooood" (parent e9c65585cb)
gooood/README.md (new normal file, +13 lines)
@@ -0,0 +1,13 @@
# Case study

The crawling side of this site is simple; the cleaning logic is a bit more tedious, because the article body is rendered from backend-generated HTML.

**Project structure**

1. docs is the directory for stored results
2. spider is the directory for the crawler scripts
3. extrator is the directory for the parsing scripts

**A few known issues**

1. This is only a simple example; the project has no test cases, so long-term correctness of the code cannot be guaranteed.
2. The logic for the second project-information block (项目信息2) is rather fiddly; given limited time it is not handled for now.
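
For orientation, a hypothetical run order inferred from the scripts in this commit (main.py is empty, so there is no driver yet); the working directories are assumptions based on the hard-coded '../docs/' paths in the code:

# Hypothetical run order, not part of this commit.
# Working directories are assumed so that each script's '../docs/' paths resolve.
#
#   cd gooood/spider   && python gooood_list.py     # writes ../docs/raw_gd_list.csv
#   cd gooood/spider   && python gooood_detail.py   # writes ../docs/raw_gd_detail.csv
#   cd gooood/extrator && python clean_detail.py    # writes ../docs/结果表.json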
gooood/dao/__init__.py (new normal file, empty)
gooood/dao/file_dao.py (new normal file, +34 lines)
@@ -0,0 +1,34 @@
import pandas as pd
import json


class File(object):

    def __init__(self):
        pass

    def write_json(self, filename, data_list):
        # Write one JSON object per line (JSON Lines) under ../docs/.
        with open(f'../docs/{filename}.json', 'w') as f:
            for item in data_list:
                json.dump(item, f)
                f.write("\n")

    def write(self, filename, data_list, columns_list):
        # Dump a list of rows to ../docs/<filename>.csv.
        filename = f'../docs/{filename}.csv'
        try:
            frame = pd.DataFrame(
                data_list,
                columns=columns_list
            )
            frame.to_csv(filename)
        except Exception as e:
            raise e
        return True

    def read(self, filename):
        # Load ../docs/<filename>.csv into a pandas DataFrame.
        try:
            filename = f'../docs/{filename}.csv'
            frame = pd.read_csv(filename)
        except Exception as e:
            raise e
        return frame
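
For reference, a small usage sketch of the File helper. The slug values are purely illustrative, and the hard-coded '../docs/' prefix means this must be run from a sibling directory such as spider/ or extrator/:

from dao.file_dao import File

file = File()
rows = [['some-project-slug'], ['another-project-slug']]  # illustrative values only
file.write('raw_gd_list', rows, ['slug'])                 # -> ../docs/raw_gd_list.csv
df = file.read('raw_gd_list')                             # pandas DataFrame with a 'slug' column
file.write_json('结果表', [{'title': 'demo'}])             # -> ../docs/结果表.json, one JSON object per line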
gooood/docs/结果表.json (new normal file, +60 lines)
File diff suppressed because one or more lines are too long
gooood/extrator/__init__.py (new normal file, empty)
gooood/extrator/base.py (new normal file, +64 lines)
@@ -0,0 +1,64 @@
from setting import PLACEHOLDER
from w3lib.html import remove_tags


class BaseExtractor(object):

    def parse_paragraph(self, selector, content_list, tag='p'):
        """
        Arguments:
            selector: scrapy.Selector.
            content_list: article content list.
            tag: paragraph tag.
        """
        text = ''.join(selector.css(tag).xpath(
            'node()').extract()).strip()
        if text:
            text = remove_tags(
                text, which_ones=('a', 'meta', 'script', 'span'))
            content_list.append(text)

    def parse_image(self, **kwargs):
        """
        Arguments:
            selector: scrapy.Selector.
            src: image src css path.
            desc: description css path; if the desc path doesn't
                exist, the description is an empty string.
            content: article content list.
            resource: article resource list.
            count: image counter.
        """
        kwargs['type'] = 'image'
        return self.parse_resource(**kwargs)

    def parse_resource(self, **kwargs):
        placeholder = PLACEHOLDER[kwargs['type']]
        count = kwargs['count']
        content_list = kwargs['content']
        resource_list = kwargs['resource']
        sel = kwargs['selector']
        src = '@src'
        if 'src_attr' in kwargs:
            src = kwargs['src_attr']
        src = sel.css(kwargs['src']).xpath(src).extract()
        if not src:
            # No matching resource; indexing src[0] below would raise IndexError.
            return count
        desc = ''
        if 'desc' in kwargs:
            desc = ''.join(sel.css(kwargs['desc']).extract()).strip()
        if len(src) > 1:
            # Several resources in one node: emit one placeholder per src.
            for s in src:
                ref = placeholder.format(count)
                res = {'src': s, 'desc': desc, 'ref': ref}
                resource_list.append(res)
                content_list.append(ref)
                count += 1
            return count
        src = src[0]
        if not src:
            return count
        ref = placeholder.format(count)
        count += 1
        res = {'ref': ref, 'src': src, 'desc': desc}
        resource_list.append(res)
        content_list.append(res['ref'])
        return count
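
setting.py is not included in this commit, so the actual PLACEHOLDER values are unknown. From the way parse_resource uses PLACEHOLDER[kwargs['type']].format(count), it is presumably a dict of per-type format strings, roughly like this sketch:

# Hypothetical setting.py content (not in this commit); only the shape matters:
# a dict keyed by resource type, whose values accept .format(count).
PLACEHOLDER = {
    'image': '<!--IMG_{}-->',  # example marker format, not the project's actual value
}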
gooood/extrator/clean_detail.py (new normal file, +105 lines)
@@ -0,0 +1,105 @@
from base import BaseExtractor
import json
import re
import scrapy
from dao.file_dao import File
from lxml import etree


class CleanDetail(BaseExtractor):

    def __init__(self):
        self.file = File()

    def clean(self):
        df = self.file.read('raw_gd_detail')
        list_res = []
        for response_text in df["response_text"].values:
            selector = etree.HTML(response_text)
            title = selector.xpath('//h1/text()')[0]
            subtitle = selector.xpath('//div[contains(@class,"single-content")]/h2/text()')[0]
            url = selector.xpath('//meta[@property="og:url"]/@content')[0]

            # Project metadata: each field sits in the <div> following the
            # <span> that carries its Chinese label.
            design_company_str = selector.xpath(
                '//span[contains(text(), "设计公司")]/../following-sibling::div[1]/a/text()')
            if design_company_str:
                design_company = design_company_str[0]
            else:
                design_company = ''

            location_str = selector.xpath('//span[contains(text(), "位置")]/../following-sibling::div[1]/a/text()')
            if location_str:
                location = location_str[0]
            else:
                location = ''

            type_c_str = selector.xpath('//span[contains(text(), "类型")]/../following-sibling::div[1]/a/text()')
            if type_c_str:
                type_c = type_c_str[0]
            else:
                type_c = ''

            material_str = selector.xpath('//span[contains(text(), "材料")]/../following-sibling::div[1]/a/text()')
            if material_str:
                material = material_str[0]
            else:
                material = ''

            label_str = selector.xpath(
                '//span[contains(text(), "标签")]/../following-sibling::div[1]/a/text()')
            if label_str:
                label = label_str
            else:
                label = []

            classification_str = selector.xpath(
                '//span[contains(text(), "分类")]/../following-sibling::div[1]/a/text()')
            if classification_str:
                classification = classification_str[0]
            else:
                classification = ''

            # The article body is backend-generated HTML embedded in
            # window.__INITIAL_STATE__.
            item = re.findall(r'window.__INITIAL_STATE__=(.*?)</script>', response_text)[0]
            item_dict = json.loads(item)
            content_html = item_dict.get('post').get('content')
            selector = scrapy.Selector(text=content_html, type='html')
            nodes = []
            nodes.extend(selector.xpath('//p').extract())
            images, icount = [], 1
            contents = []
            for node in nodes:
                sel = scrapy.Selector(text=node, type='html')
                if sel.css('img ::attr(class)'):
                    icount = self.parse_image(
                        selector=sel, src='img', resource=images,
                        count=icount, content=contents, src_attr='@data-src')
                elif sel.css('p') and '{margin:' not in node:
                    self.parse_paragraph(sel, contents)
                elif sel.css('center ::text'):
                    self.parse_paragraph(sel, contents, tag='center')

            item = {
                "title": title,
                "subtitle": subtitle,
                "url": url,
                "project_1": {
                    'design_company': design_company,
                    'location': location,
                    'type_c': type_c,
                    'material': material,
                    'label': label,
                    'classification': classification
                },
                "contents": contents,
                "images": images
            }
            list_res.append(item)
        self.file.write_json('结果表', list_res)

    def run(self):
        self.clean()


if __name__ == '__main__':
    clean_detail = CleanDetail()
    clean_detail.run()
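
The diff for docs/结果表.json is suppressed above, but the shape of each JSON line follows directly from the item dict built in clean(); all values below are placeholders, not real output:

# Schema of one record in docs/结果表.json (keys from the item dict above;
# every value is a placeholder).
record = {
    'title': '...',
    'subtitle': '...',
    'url': '...',
    'project_1': {
        'design_company': '...',
        'location': '...',
        'type_c': '...',
        'material': '...',
        'label': ['...'],
        'classification': '...',
    },
    'contents': ['paragraph text or image placeholder reference', '...'],
    'images': [{'src': '...', 'desc': '...', 'ref': '...'}],
}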
gooood/main.py (new normal file, empty)
gooood/spider/__init__.py (new normal file, empty)
gooood/spider/gooood_detail.py (new normal file, +31 lines)
@@ -0,0 +1,31 @@
from dao.file_dao import File
import requests


file = File()


def gooood_detail(address):
    # Fetch a single detail page by slug, forcing the zh_CN locale cookie.
    url = f"https://www.gooood.cn/{address}.htm"
    payload = {}
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.request("GET", url, headers=headers, data=payload)
    return response.text


def run():
    df = file.read('raw_gd_list')
    response_text_list = []

    for slug in df["slug"].values:
        print(slug)
        response_text = gooood_detail(slug)
        response_text_list.append([response_text])

    status = file.write('raw_gd_detail', response_text_list, ['response_text'])
    if status:
        print('详情页面抓取完成')  # detail pages fetched


if __name__ == '__main__':
    run()
gooood/spider/gooood_list.py (new normal file, +30 lines)
@@ -0,0 +1,30 @@
from dao.file_dao import File
import requests


file = File()


def gooood_list(page):
    # Fetch one page of the WordPress posts API (20 posts per page).
    url = f"https://dashboard.gooood.cn/api/wp/v2/posts?page={page}&per_page=20"
    payload = {}
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.request("GET", url, headers=headers, data=payload)
    return response.json()


def run():
    slug_list = []
    for page in range(1, 4):
        res_list = gooood_list(page)
        for res in res_list:
            slug = res.get('slug')
            slug_list.append([slug])

    status = file.write('raw_gd_list', slug_list, ['slug'])
    if status:
        print('列表页面抓取完成')  # list pages fetched


if __name__ == '__main__':
    run()