1,安装python 3.x
2,执行下列代码,记得先修改main()内的信息
import os
import re
import parsel
import requests
import tomd
def get_article_info(url):
html = requests.get(url, headers=headers).text
selector = parsel.Selector(html)
urls = selector.css('.blog-list-box > a').xpath('.//@href').getall()
print('共找到%d篇文章...' % len(urls))
return urls
def get_html_from_csdn(url):
html = requests.get(url, headers=headers).text
selector = parsel.Selector(html)
title = selector.css('div.article-title-box > h1::text').get()
article = selector.css('div.article_content').get()
category = selector.css('div.blog-tags-box > div > a::text').getall()[0]
tags = selector.css('div.blog-tags-box > div > a[data-report-click*="mod"]::text').getall()
time_stamp = selector.css('div > span.time::text').get()
author = selector.css('#uid > span.name::text').get()
origin = url
return title, article, category, tags, time_stamp, author, origin
def html_to_md(title, article, category, tags, time_stamp, author, origin):
md = tomd.convert(article)
# 图片url标准化
url_pattern = re.compile(r'<img.*?(https://.*?\.gif|https://.*?\.png).*?">')
for src_url in url_pattern.finditer(md):
img_name = src_url.group(1).split('/')[-1]
md = md.replace(src_url.group(0), '' % (img_name, src_url.group(1)))
print('正在下载 %s' % title)
text = "---\ntitle: %s\ndate: %s\ntags: [%s]\ncategories: %s\n---\n\n> 作者: %s\n> 原文链接: %s\n%s" % (
title, time_stamp, ', '.join(tags), category, author, origin, md)
# Windows下文件名字不能包含特殊符号
file_name = re.sub(r'[\\/:*?"<>|]', ' ', title)
with open('articles/%s.md' % file_name.strip(), 'w', encoding='utf-8') as f:
f.write(text)
def main(url):
if not os.path.exists('articles'):
os.mkdir('articles')
article_urls = get_article_info(url)
for article_url in article_urls:
title, article, category, tags, time_stamp, author, origin = get_html_from_csdn(article_url)
html_to_md(title, article, category, tags, time_stamp, author, origin)
print('完成%d篇文章的下载' % len(article_urls))
def main1(url):
title, article, category, tags, time_stamp, author, origin = get_html_from_csdn(url)
html_to_md(title, article, category, tags, time_stamp, author, origin)
print('完成'+title+'的下载')
if __name__ == '__main__':
headers = {
'Host': 'blog.csdn.net',
'Referer': 'https://blog.csdn.net',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3542.0 Safari/537.36'
}
start_url = "https://blog.csdn.net/lenkty"
#main(start_url )
iurl="https://blog.csdn.net/lenkty/article/details/86591364"
main1(iurl);
3,在articles文件目录下查看文章列表
更多信息请关注公众号: