Using BeautifulSoup and pdfkit to crawl the runoob.com (菜鸟教程) tutorials into a PDF!


BeautifulSoup, a sharp tool:

BeautifulSoup is one of the most widely used libraries for web scraping in Python. Its main job is to extract data from web pages, and a complete scraping task can be implemented with very little code.

A brief introduction to BeautifulSoup

BeautifulSoup provides simple, Pythonic functions for navigating, searching, and modifying the parse tree. It is a toolkit that parses a document and hands you the data you need to extract; because it is so simple, a complete application takes very little code to write.
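
As a tiny illustration of that style of API (the HTML string here is made up for the example), finding an element by id and reading attributes looks like this:

from bs4 import BeautifulSoup

html = '<div id="content"><h1>Python3 教程</h1><a target="_top" href="/python3/python3-tutorial.html">Intro</a></div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find(id='content').h1.string)          # Python3 教程
print(soup.find('a', target='_top').get('href'))  # /python3/python3-tutorial.html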

A brief introduction to pdfkit

pdfkit is a tool for converting HTML+CSS documents into PDF files. It is a wrapper around the HTML-to-PDF toolkit wkhtmltopdf, so wkhtmltopdf must be installed and its installation path added to the system PATH environment variable.

See also: installing and using pdfkit and wkhtmltopdf
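
To get the environment ready, a minimal sketch (package names as published on PyPI; the Windows path below is only an example of a default install and should match your own machine):

# Shell: pip install pdfkit beautifulsoup4 requests PyPDF2
# wkhtmltopdf itself is a separate binary, downloaded from https://wkhtmltopdf.org/
import pdfkit

# If wkhtmltopdf is not on PATH, point pdfkit at the executable explicitly.
config = pdfkit.configuration(wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_url('http://www.runoob.com/python3/python3-tutorial.html', 'test.pdf', configuration=config)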

Key scraping code

The shared variables and configurable settings are listed below.

# Example: crawl the Python 3 tutorial at http://www.runoob.com/python3/python3-tutorial.html
# The URL is split into the four parameters below; usually only language needs to be changed
language = 'python3'
list_tag = '_top'
content_tag = 'content'
path_wkthmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
# The parameters below do not need to be configured
child_url = '/' + language
url_tag = child_url + '/'
output_name = u"runoob_" + language + r"教程.pdf"
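
To make the URL assembly concrete: with language = 'python3', child_url becomes '/python3', url_tag becomes '/python3/', and output_name becomes 'runoob_python3教程.pdf'. The chapter list is then fetched from root_url + child_url, where root_url (defined in the complete code below) is http://www.runoob.com.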

The get_url_title_list method: get the URLs of the tutorial chapters

def get_url_title_list():
    """
    Get the full list of chapter titles and URLs
    :return: (title, url_path)
    """
    root = root_url
    temp_child_url = child_url
    resp = requests.get(root + temp_child_url)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    x = soup.find("div", class_="design")
    x = x.find_all("a", target=list_tag)
    title = []
    url_path = []
    for i in x:
        value = i.string.strip()
        title.append(value)
        temp_href = i.get('href').strip()
        if temp_href.find(url_tag) >= 0:
            href = root + temp_href
        else:
            href = root + temp_child_url + '/' + temp_href
        url_path.append(href)
    return title, url_path
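
A quick usage sketch of the function above (it runs against the live site, so the actual titles depend on runoob.com's current menu):

titles, urls = get_url_title_list()
# The two lists are parallel: titles[i] is the chapter name for urls[i]
for t, u in zip(titles, urls):
    print(t, u)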

The parse_url_to_html method: save each crawled target page as a local HTML file

def parse_url_to_html(url, name):
    """
    Parse the URL and save the page content as an HTML file
    :param url: the url to parse
    :param name: file name of the saved html
    :return: html
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Main content
        body = soup.find(id=content_tag)
        # Title
        title = body.find('h1')
        if title is None:
            title = body.find_all('h2')[0]
            title_tag = soup.new_tag('h1')
            title_tag.string = title.string
            title = title_tag
        # Insert the title at the top of the content, centered
        center_tag = soup.new_tag("center")
        center_tag.insert(1, title)
        body.insert(1, center_tag)
        html = str(body)
        # Rewrite relative img src paths inside body to absolute URLs
        pattern = "(<img .*?src=\")(.*?)(\")"

        def func(m):
            # group(2) is the src value; absolute http(s) URLs are left untouched
            if not m.group(2).startswith("http"):
                if m.group(2).find('https') >= 0:
                    return m.group(1) + m.group(2) + m.group(3)
                if m.group(2).find('runoob.com') >= 0:
                    # protocol-relative URL such as //www.runoob.com/...
                    rtn = m.group(1) + "http:" + m.group(2) + m.group(3)
                    print(rtn)
                else:
                    # site-relative path such as /images/...
                    rtn = m.group(1) + root_url + m.group(2) + m.group(3)
                    print(rtn)
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)

        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception as e:
        logging.error("Parse error", exc_info=True)
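
To illustrate what the src rewriting above does, a small standalone sketch (the sample img tags are made up; the three cases are: already-absolute URLs left alone, protocol-relative //runoob.com URLs prefixed with http:, and site-relative paths prefixed with root_url):

import re

root_url = "http://www.runoob.com"
pattern = "(<img .*?src=\")(.*?)(\")"
samples = [
    '<img src="https://static.runoob.com/images/demo.png">',
    '<img src="//www.runoob.com/wp-content/pic.png">',
    '<img src="/images/icon.png">',
]

def fix_src(m):
    src = m.group(2)
    if src.startswith("http"):
        return m.group(0)                               # already absolute: unchanged
    if src.find('runoob.com') >= 0:
        return m.group(1) + "http:" + src + m.group(3)  # protocol-relative
    return m.group(1) + root_url + src + m.group(3)     # site-relative

for s in samples:
    print(re.compile(pattern).sub(fix_src, s))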

The save_pdf method: convert an HTML file into a PDF file

def save_pdf(htmls, file_name):
    """
    Save the html file(s) to a pdf file
    :param htmls: html file path(s)
    :param file_name: pdf file name
    :return:
    """
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options, configuration=config)
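
A quick usage sketch (file names hypothetical). Note that pdfkit.from_file can also take a list of HTML paths and produce one combined PDF in a single call, although the script below converts one file at a time and merges with PyPDF2 so that per-chapter bookmarks can be added:

save_pdf('0.html', 'temp_0.pdf')
# or, combining several pages at once (no per-chapter bookmarks this way):
# pdfkit.from_file(['0.html', '1.html'], 'all.pdf', options=options, configuration=config)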


The append_pdf method: concatenate the PDFs

def append_pdf(input1, output1, bookmark):
    bookmark_num = output1.getNumPages()
    print(bookmark_num)
    for page_num in range(input1.numPages):
        output1.addPage(input1.getPage(page_num))

    output1.addBookmark(bookmark, bookmark_num)

After the methods above have finished, run the following to write the merged PDF to disk:
output.write(open(output_name, "wb"))
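
One caveat: getNumPages, numPages, addPage, getPage and addBookmark are the PyPDF2 1.x names used throughout this post. With PyPDF2 2.x or its successor pypdf installed, a roughly equivalent sketch would use the renamed API (not verified against every version):

from pypdf import PdfReader, PdfWriter  # pip install pypdf

def append_pdf(reader, writer, bookmark):
    bookmark_page = len(writer.pages)    # index of the page where this chapter will start
    for page in reader.pages:
        writer.add_page(page)
    writer.add_outline_item(bookmark, bookmark_page)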


Complete code

# -*-coding:utf-8-*-
import os
import re
import time
import sys
import logging
import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileWriter, PdfFileReader

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""

root_url = "http://www.runoob.com"
# Example crawler for Runoob.com
# For instance, to crawl the site's Python 3 tutorial, open python3; its URL is http://www.runoob.com/python3/python3-tutorial.html
# The URL is split into the four parameters below; usually only language needs to be changed
language = 'python3'  # tutorial category to crawl ------ change this for different content
list_tag = '_top'  # target attribute of the links in the left-hand menu ------ usually no need to change
content_tag = 'content'  # id of the main content div ------ usually no need to change
path_wkthmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # local wkhtmltopdf executable ------ adjust to your own machine
# The parameters below do not need to be configured
child_url = '/' + language  # sub-URL of the tutorial
url_tag = child_url + '/'  # used to decide how to absolutize hrefs from the left-hand menu
output_name = u"runoob_" + language + r"教程.pdf"  # output file name


def parse_url_to_html(url, name):
    """
    Parse the URL and save the page content as an HTML file
    :param url: the url to parse
    :param name: file name of the saved html
    :return: html
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Main content
        body = soup.find(id=content_tag)
        # Title
        title = body.find('h1')
        if title is None:
            title = body.find_all('h2')[0]
            title_tag = soup.new_tag('h1')
            title_tag.string = title.string
            title = title_tag
        # Insert the title at the top of the content, centered
        center_tag = soup.new_tag("center")
        center_tag.insert(1, title)
        body.insert(1, center_tag)
        html = str(body)
        # Rewrite relative img src paths inside body to absolute URLs
        pattern = "(<img .*?src=\")(.*?)(\")"

        def func(m):
            # group(2) is the src value; absolute http(s) URLs are left untouched
            if not m.group(2).startswith("http"):
                if m.group(2).find('https') >= 0:
                    return m.group(1) + m.group(2) + m.group(3)
                if m.group(2).find('runoob.com') >= 0:
                    # protocol-relative URL such as //www.runoob.com/...
                    rtn = m.group(1) + "http:" + m.group(2) + m.group(3)
                    print(rtn)
                else:
                    # site-relative path such as /images/...
                    rtn = m.group(1) + root_url + m.group(2) + m.group(3)
                    print(rtn)
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)

        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception as e:
        logging.error("Parse error", exc_info=True)


# Helper to strip all div tags
# Remove div (to remove a tags instead, just replace div with a)
# remove_tag(html, "div")
def remove_tag(text, tag):
    return text[:text.find("<" + tag + ">")] + text[text.find("</" + tag + ">") + len(tag) + 3:]


def get_url_title_list():
    """
    Get the full list of chapter titles and URLs
    :return: (title, url_path)
    """
    root = root_url
    temp_child_url = child_url
    resp = requests.get(root + temp_child_url)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    x = soup.find("div", class_="design")
    x = x.find_all("a", target=list_tag)
    title = []
    url_path = []
    for i in x:
        value = i.string.strip()
        title.append(value)
        temp_href = i.get('href').strip()
        if temp_href.find(url_tag) >= 0:
            href = root + temp_href
        else:
            href = root + temp_child_url + '/' + temp_href
        url_path.append(href)
    return title, url_path


def save_pdf(htmls, file_name):
    """
    Save the html file(s) to a pdf file
    :param htmls: html file path(s)
    :param file_name: pdf file name
    :return:
    """
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options, configuration=config)


def append_pdf(input1, output1, bookmark):
    bookmark_num = output1.getNumPages()
    print(bookmark_num)
    for page_num in range(input1.numPages):
        output1.addPage(input1.getPage(page_num))
    output1.addBookmark(bookmark, bookmark_num)


def main():
    output = PdfFileWriter()
    start = time.time()
    file_name = u"temp_"
    result = get_url_title_list()
    titles = result[0]
    urls = result[1]
    print(titles)
    print(urls)
    for index, url in enumerate(urls):
        parse_url_to_html(url, str(index) + ".html")
    htmls = []
    pdfs = []
    for i in range(0, len(urls)):
        htmls.append(str(i) + '.html')
        pdfs.append(file_name + str(i) + '.pdf')
        save_pdf(str(i) + '.html', file_name + str(i) + '.pdf')
        print(u"Converted html #" + str(i))
    i = 0
    for pdf in pdfs:
        fd = open(pdf, 'rb')
        append_pdf(PdfFileReader(fd), output, titles[i])
        i = i + 1
        print(u"Merged pdf #" + str(i) + ": " + pdf)
    output.write(open(output_name, "wb"))
    print(u"PDF written successfully!")
    for html in htmls:
        os.remove(html)
        print(u"Deleted temp file " + html)
    for pdf in pdfs:
        os.remove(pdf)
        print(u"Deleted temp file " + pdf)
    total_time = time.time() - start
    print(u"Total time: %f seconds" % total_time)


if __name__ == '__main__':
    try:
        main()
    except OSError as err:
        print("OS error: {0}".format(err))
    except ValueError:
        print("Could not convert data to an integer.")
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])

Known issues

When deleting the intermediate PDFs, an error is raised (OS error: [WinError 32] The process cannot access the file because it is being used by another process: 'temp_0.pdf'): the file is still in use, so deleting the PDF fails. Since I am still learning Python, my repeated attempts to solve this have all failed so far; I will fix it once my skills improve~
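
For what it's worth, the error most likely comes from the file objects opened for PdfFileReader in main() never being closed: Windows refuses to delete a file that still has an open handle, and PyPDF2 needs the source streams to stay open until output.write() runs. One possible fix, sketched here but not tested, would replace the merge-and-delete portion of main() with something like:

readers = []
for i, pdf in enumerate(pdfs):
    fd = open(pdf, 'rb')
    readers.append(fd)                      # keep the stream open until output.write()
    append_pdf(PdfFileReader(fd), output, titles[i])
with open(output_name, 'wb') as out:
    output.write(out)
for fd in readers:
    fd.close()                              # release the handles first...
for pdf in pdfs:
    os.remove(pdf)                          # ...then deleting the temp pdfs should succeed
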
If anything is unclear, feel free to leave a comment or send me an email.

