mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 03:59:57 +08:00
gooood
This commit is contained in:
parent e9c65585cb
commit bde9b2fe55
13  gooood/README.md  Normal file
@@ -0,0 +1,13 @@
# Case study

The crawling part of this site is simple; the cleaning logic is a bit more involved, because the article body is delivered as backend-generated HTML.

**Project structure**

1. docs is the directory where results are stored
2. spider is the directory with the crawler scripts
3. extrator is the directory with the parsing scripts

(A hypothetical end-to-end driver for these three stages is sketched right after this README.)

**Known issues**

1. This is only a simple example; the project has no test cases, so long-term correct operation of the code cannot be guaranteed
2. The logic for the second block of project information is fairly fiddly; given limited time it is left unhandled for now
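For orientation only: the three stages are meant to run in order (list spider, then detail spider, then the cleaner). Below is a minimal, hypothetical driver that is not part of this commit. It assumes it lives in the gooood/ project root next to docs/, spider/ and extrator/, that each stage runs with its own directory as the working directory (so the relative ../docs paths in File resolve), and that extrator/ contains the setting.py that base.py imports, which is not shown in this diff.

# Hypothetical pipeline driver (not in this commit); place it in the gooood/ root.
import os
import subprocess
import sys

ROOT = os.path.dirname(os.path.abspath(__file__))
# Put the project root on PYTHONPATH so 'from dao.file_dao import File' resolves.
env = {**os.environ, "PYTHONPATH": ROOT}

STAGES = [
    ("spider", "gooood_list.py"),      # writes ../docs/raw_gd_list.csv
    ("spider", "gooood_detail.py"),    # writes ../docs/raw_gd_detail.csv
    ("extrator", "clean_detail.py"),   # writes ../docs/结果表.json
]

for cwd, script in STAGES:
    print(f"running {cwd}/{script}")
    subprocess.run([sys.executable, script],
                   cwd=os.path.join(ROOT, cwd), env=env, check=True)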
0  gooood/dao/__init__.py  Normal file
34  gooood/dao/file_dao.py  Normal file
@@ -0,0 +1,34 @@
import pandas as pd
import json


class File(object):

    def __init__(self):
        pass

    def write_json(self, filename, data_list):
        with open(f'../docs/{filename}.json', 'w') as f:
            for item in data_list:
                json.dump(item, f)
                f.write("\n")

    def write(self, filename, data_list, columns_list):
        filename = f'../docs/{filename}.csv'
        try:
            frame = pd.DataFrame(
                data_list,
                columns=columns_list
            )
            frame.to_csv(filename)
        except Exception as e:
            raise e
        return True

    def read(self, filename):
        try:
            filename = f'../docs/{filename}.csv'
            frame = pd.read_csv(filename)
        except Exception as e:
            raise e
        return frame
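A small, hypothetical usage sketch of File, not part of this commit. It assumes it is run from spider/ or extrator/ so the relative ../docs path resolves, and that the project root is on sys.path, the same assumption the spider scripts make; the filename "demo" is made up for illustration.

# Hypothetical File usage (run from spider/ or extrator/, project root on sys.path).
from dao.file_dao import File

file = File()
file.write("demo", [["hello"], ["world"]], ["text"])   # -> ../docs/demo.csv
frame = file.read("demo")                               # pandas DataFrame
print(frame["text"].tolist())                           # ['hello', 'world']
file.write_json("demo", [{"text": t} for t in frame["text"]])  # one JSON object per line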
60  gooood/docs/结果表.json  Normal file
File diff suppressed because one or more lines are too long
0  gooood/extrator/__init__.py  Normal file
64  gooood/extrator/base.py  Normal file
@@ -0,0 +1,64 @@
from setting import PLACEHOLDER
from w3lib.html import remove_tags


class BaseExtractor(object):

    def parse_paragraph(self, selector, content_list, tag='p'):
        """
        arguments:
            selector: scrapy.Selector.
            content_list: article content list.
            tag: paragraph tag.
        """
        text = ''.join(selector.css(tag).xpath(
            'node()').extract()).strip()
        if text:
            text = remove_tags(
                text, which_ones=('a', 'meta', 'script', 'span'))
            content_list.append(text)

    def parse_image(self, **kwargs):
        """
        arguments:
            selector: scrapy.Selector.
            src: image src css path.
            desc: description css path; if it isn't given,
                the description is an empty string.
            content: article content list.
            resource: article resource list.
            count: image counter.
        """
        kwargs['type'] = 'image'
        return self.parse_resource(**kwargs)

    def parse_resource(self, **kwargs):
        placeholder = PLACEHOLDER[kwargs['type']]
        count = kwargs['count']
        content_list = kwargs['content']
        resource_list = kwargs['resource']
        sel = kwargs['selector']
        src = '@src'
        if 'src_attr' in kwargs:
            src = kwargs['src_attr']
        src = sel.css(kwargs['src']).xpath(src).extract()
        desc = ''
        if 'desc' in kwargs:
            desc = ''.join(sel.css(kwargs['desc']).extract()).strip()
        if len(src) > 1:
            for s in src:
                ref = placeholder.format(count)
                res = {'src': s, 'desc': desc, 'ref': ref}
                resource_list.append(res)
                content_list.append(ref)
                count += 1
            return count
        if not src:
            # The selector matched nothing; keep the counter unchanged.
            return count
        src = src[0]
        if not src:
            return count
        ref = placeholder.format(count)
        count += 1
        res = {'ref': ref, 'src': src, 'desc': desc}
        resource_list.append(res)
        content_list.append(res['ref'])
        return count
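A minimal, hypothetical driver for BaseExtractor, not part of this commit. setting.py is not shown in this diff, so the shape of PLACEHOLDER used below, a dict mapping a resource type to a format string with one slot for the counter, is an assumption; the example registers a stub only if no real setting.py is importable, and it must be run inside extrator/ so base.py can be imported.

# Hypothetical BaseExtractor driver (run inside extrator/).
import sys
import types

import scrapy

try:
    import setting  # use the project's real setting.py if it exists
except ImportError:
    stub = types.ModuleType('setting')
    stub.PLACEHOLDER = {'image': '[image-{}]'}  # assumed shape: type -> format string
    sys.modules['setting'] = stub

from base import BaseExtractor

extractor = BaseExtractor()
contents, images, count = [], [], 1

# A plain paragraph node becomes a text entry.
extractor.parse_paragraph(
    scrapy.Selector(text='<p>Some body text</p>', type='html'), contents)

# An image node becomes a placeholder in contents plus a resource record.
count = extractor.parse_image(
    selector=scrapy.Selector(
        text='<p><img class="pic" data-src="https://example.com/a.jpg"></p>',
        type='html'),
    src='img', src_attr='@data-src',
    content=contents, resource=images, count=count)

print(contents)  # paragraph text followed by the image placeholder
print(images)    # [{'ref': ..., 'src': 'https://example.com/a.jpg', 'desc': ''}]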
105  gooood/extrator/clean_detail.py  Normal file
@@ -0,0 +1,105 @@
from base import BaseExtractor
import json
import re
import scrapy
from dao.file_dao import File
from lxml import etree


class CleanDetail(BaseExtractor):

    def __init__(self):
        self.file = File()

    def clean(self):
        # Parse every raw detail page saved by spider/gooood_detail.py.
        df = self.file.read('raw_gd_detail')
        list_res = []
        for response_text in df["response_text"].values:
            selector = etree.HTML(response_text)
            # Basic fields come from the server-rendered markup.
            title = selector.xpath('//h1/text()')[0]
            subtitle = selector.xpath('//div[contains(@class,"single-content")]/h2/text()')[0]
            url = selector.xpath('//meta[@property="og:url"]/@content')[0]

            # Project metadata sits next to labelled <span> headings in the sidebar.
            design_company_str = selector.xpath(
                '//span[contains(text(), "设计公司")]/../following-sibling::div[1]/a/text()')
            if design_company_str:
                design_company = design_company_str[0]
            else:
                design_company = ''

            location_str = selector.xpath('//span[contains(text(), "位置")]/../following-sibling::div[1]/a/text()')
            if location_str:
                location = location_str[0]
            else:
                location = ''

            type_c_str = selector.xpath('//span[contains(text(), "类型")]/../following-sibling::div[1]/a/text()')
            if type_c_str:
                type_c = type_c_str[0]
            else:
                type_c = ''

            material_str = selector.xpath('//span[contains(text(), "材料")]/../following-sibling::div[1]/a/text()')
            if material_str:
                material = material_str[0]
            else:
                material = ''

            label_str = selector.xpath(
                '//span[contains(text(), "标签")]/../following-sibling::div[1]/a/text()')
            if label_str:
                label = label_str
            else:
                label = []

            classification_str = selector.xpath(
                '//span[contains(text(), "分类")]/../following-sibling::div[1]/a/text()')
            if classification_str:
                classification = classification_str[0]
            else:
                classification = ''

            # The article body itself lives in the window.__INITIAL_STATE__ JSON blob.
            item = re.findall(r'window.__INITIAL_STATE__=(.*?)</script>', response_text)[0]
            item_dict = json.loads(item)
            content_html = item_dict.get('post').get('content')
            selector = scrapy.Selector(text=content_html, type='html')
            nodes = []
            nodes.extend(selector.xpath('//p').extract())
            images, icount = [], 1
            contents = []
            # Walk the <p> nodes: images become placeholders, text becomes paragraphs.
            for node in nodes:
                sel = scrapy.Selector(text=node, type='html')
                if sel.css('img ::attr(class)'):
                    icount = self.parse_image(
                        selector=sel, src='img', resource=images,
                        count=icount, content=contents, src_attr='@data-src')
                elif sel.css('p') and '{margin:' not in node:
                    self.parse_paragraph(sel, contents)
                elif sel.css('center ::text'):
                    self.parse_paragraph(sel, contents, tag='center')

            item = {
                "title": title,
                "subtitle": subtitle,
                "url": url,
                "project_1": {
                    'design_company': design_company,
                    'location': location,
                    'type_c': type_c,
                    'material': material,
                    'label': label,
                    'classification': classification
                },
                "contents": contents,
                "images": images
            }
            list_res.append(item)
        self.file.write_json('结果表', list_res)

    def run(self):
        self.clean()


if __name__ == '__main__':
    clean_detail = CleanDetail()
    clean_detail.run()
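The key step in clean() is that the article body is not taken from the static markup but from the window.__INITIAL_STATE__ JSON blob embedded in a script tag. A self-contained illustration of just that step, using a made-up fragment rather than real gooood.cn markup:

# Standalone illustration of the __INITIAL_STATE__ extraction (made-up HTML).
import json
import re

response_text = (
    '<script>window.__INITIAL_STATE__='
    '{"post": {"content": "<p>hello</p><p>world</p>"}}'
    '</script>'
)

blob = re.findall(r'window.__INITIAL_STATE__=(.*?)</script>', response_text)[0]
content_html = json.loads(blob).get('post').get('content')
print(content_html)  # the HTML that clean() then splits into <p> nodes and parses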
0  gooood/main.py  Normal file
0  gooood/spider/__init__.py  Normal file
31  gooood/spider/gooood_detail.py  Normal file
@@ -0,0 +1,31 @@
from dao.file_dao import File
import requests


file = File()


def gooood_detail(address):
    url = f"https://www.gooood.cn/{address}.htm"
    payload = {}
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.request("GET", url, headers=headers, data=payload)
    return response.text


def run():
    df = file.read('raw_gd_list')
    response_text_list = []

    for slug in df["slug"].values:
        print(slug)
        response_text = gooood_detail(slug)
        response_text_list.append([response_text])  # one row per detail page

    status = file.write('raw_gd_detail', response_text_list, ['response_text'])
    if status:
        print('详情页面抓取完成')  # "detail pages fetched"


if __name__ == '__main__':
    run()
30  gooood/spider/gooood_list.py  Normal file
@@ -0,0 +1,30 @@
from dao.file_dao import File
import requests


file = File()


def gooood_list(page):
    url = f"https://dashboard.gooood.cn/api/wp/v2/posts?page={page}&per_page=20"
    payload = {}
    headers = {'cookie': 'language=zh_CN;'}
    response = requests.request("GET", url, headers=headers, data=payload)
    return response.json()


def run():
    slug_list = []
    for page in range(1, 4):  # first 3 pages, 20 posts per page
        res_list = gooood_list(page)
        for res in res_list:
            slug = res.get('slug')
            slug_list.append([slug])

    status = file.write('raw_gd_list', slug_list, ['slug'])
    if status:
        print('列表页面抓取完成')  # "list pages fetched"


if __name__ == '__main__':
    run()
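After both spiders and the cleaner have run, docs/结果表.json holds one JSON object per line (see File.write_json). A small, hypothetical sanity check of that output, not part of this commit; it assumes it is run from spider/ or extrator/ so ../docs resolves:

# Hypothetical check of the final output (run from spider/ or extrator/).
import json

with open('../docs/结果表.json') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(len(records))
print(records[0]['title'], records[0]['url'])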