In [4]:
import requests as r
from bs4 import BeautifulSoup, Tag
import json
import os
from threading import Thread
from urllib.parse import urlparse

In [5]:
def download_worker(media_url, save_path):
    res = r.get(media_url)
    with open(save_path, 'wb') as fp:
        fp.write(res.content)
    print('[crawler] 图像已经保存至', save_path)

In [14]:
titles = []
fail_ids = []

def crawler_issue(issue_id: str):
    folder = 'issue-' + str(issue_id)
    os.makedirs('../docs/digital-issue/' + folder, exist_ok=True)
    res = r.get('https://github.com/Digital-EDA/Digital-IDE/issues/' + str(issue_id), timeout=10)
    soup = BeautifulSoup(res.text, 'html.parser')

    texts = []
    el = soup.find('bdi')
    if isinstance(el, Tag):
        title = el.text
        titles.append(title)
        texts.append('问题标题 '+ title)
    
    for td in soup.find_all('td', attrs={ 'class': 'js-comment-body' }):
        if isinstance(td, Tag):
            for p in td.find_all('p'):
                if isinstance(p, Tag):
                    text = p.text
                    texts.append(text)
                    for a in p.find_all('a'):
                        href = a.attrs['href']
                        if href and '.png' in href:
                            urlp = urlparse(href)
                            name = urlp.path.split('/')[-1]
                            save_path = '../docs/digital-issue/' + folder + '/' + name
                            t = Thread(target=download_worker, args=(href, save_path))
                            t.start()

    text = '\n'.join(texts)
    with open('../docs/digital-issue/{}/issue.md'.format(folder), 'w', encoding='utf-8') as fp:
        fp.write(text)    

In [5]:
crawler_issue(67)

[crawler] 图像已经保存至 ../docs/digital-issue/issue-67/331660157-920d7143-f262-42d5-af57-a817bf3aee01.png


In [15]:
def crawler_issue_page(page_url: str):
    res = r.get(page_url)
    if res.status_code != 200:
        print('page url {} return 404'.format(page_url))
        return
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    issue_container = soup.find('div', { 'class': 'js-navigation-container js-active-navigation-container' })
    issue_ids = []
    for div in issue_container.children:
        if isinstance(div, Tag):
            id = div.attrs['id'].split('_')[-1]
            issue_ids.append(int(id))
    
    for issue_id in issue_ids:
        try:
            print('爬取 issue-{} 中 ...'.format(issue_id))
            crawler_issue(issue_id)
        except Exception as e:
            fail_ids.append(issue_id)
            print('爬取 {} 失败'.format(issue_id))

In [16]:
page_urls = [
    'https://github.com/Digital-EDA/Digital-IDE/issues?page=1&q=',
    'https://github.com/Digital-EDA/Digital-IDE/issues?page=2&q=',
    'https://github.com/Digital-EDA/Digital-IDE/issues?page=3&q='
]

titles.clear()
fail_ids.clear()
for url in page_urls:
    crawler_issue_page(url)

爬取 issue-71 中 ...
爬取 issue-70 中 ...
爬取 issue-69 中 ...
爬取 issue-68 中 ...
[crawler] 图像已经保存至 ../docs/digital-issue/issue-69/331220146-9deeccbf-cc0e-4810-bdd9-80e11d083c15.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-69/331220242-9abe8b7b-5985-4c1a-ac0f-30aba75ef8d2.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-69/331220203-3bb8aefd-e04a-4eb7-ae87-ca48f1daa120.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-69/331220080-b0c5f0af-e38c-4819-9efa-7491650ddb92.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-69/331219978-d22a5a5d-da00-430c-b966-68517ab264c0.png
爬取 issue-67 中 ...
爬取 issue-66 中 ...
爬取 66 失败
爬取 issue-65 中 ...
[crawler] 图像已经保存至 ../docs/digital-issue/issue-67/331660157-920d7143-f262-42d5-af57-a817bf3aee01.png
爬取 issue-64 中 ...
[crawler] 图像已经保存至 ../docs/digital-issue/issue-65/328185054-e08c66b2-7e87-4238-88cb-e0672b2de530.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-65/328184842-2e13483e-4ece-4eb6-8c8a-3d9c92a97651.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-6

[crawler] 图像已经保存至 ../docs/digital-issue/issue-7/260630481-0cbc73cf-f516-4b3d-92f5-17598f089297.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-4/255386339-56413f1c-bb32-49c0-aa85-dceeceb8594a.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-6/260716833-f3d89a67-7b4f-4daa-9a0a-8313dcf9caaa.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-2/253884964-cbaf42fa-fa7d-48ed-8353-184dd0895a12.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-2/256974788-7c2e9bf0-d239-4022-9aac-f8f160afad6f.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-2/255247553-45d154cc-37d1-459d-80d3-adad6324de4c.png
[crawler] 图像已经保存至 ../docs/digital-issue/issue-2/256974788-7c2e9bf0-d239-4022-9aac-f8f160afad6f.png


In [19]:
for issue_id in fail_ids:
    print(issue_id)

66
54
50
31
19


In [23]:
import yaml
target_yml = '../config/github-issue.story.yml'

story_config = {
    'stories': []
}

for title in titles:
    story_config['stories'].append({
        'message': title,
        'intent': 'bug'
    })
with open(target_yml, 'w', encoding='utf-8') as fp:
    yaml.dump(story_config, fp, Dumper=yaml.Dumper, indent=2, allow_unicode=True)

Exception in thread Thread-66 (download_worker):
Traceback (most recent call last):
  File "/data/zhelonghuang/miniconda3/lib/python3.11/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    self._validate_conn(conn)
  File "/data/zhelonghuang/miniconda3/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1099, in _validate_conn
    conn.connect()
  File "/data/zhelonghuang/miniconda3/lib/python3.11/site-packages/urllib3/connection.py", line 653, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/zhelonghuang/miniconda3/lib/python3.11/site-packages/urllib3/connection.py", line 806, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "/data/zhelonghuang/miniconda3/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 465, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, 