Python——爬虫小说学习(仅限于个人娱乐)。娱乐项目记载:爬取网络上的小说,娱乐使用,仅供参考;不同的网站可能格式不同,大家自行钻研,嘿嘿。


前言

娱乐项目记载:爬取网络上的小说


一、演示

二、文件目录示意

三、使用步骤

1.引入库

代码如下(示例):

import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl

2.界面控制程序

main_pc.py:主要显示界面,消息发送,启动QT业务线程。

代码如下:

#_*_ coding:utf-8 _*_
'''
#1.获取书名
#2.获取链接和目录名
#3.获取内容
#4.保存内容'''
import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl
class LanFei_show_window(QMainWindow, Ui_MainWindow):
    """Main window of the novel downloader.

    Mixes the generated ``Ui_MainWindow`` layout into a ``QMainWindow``,
    wires the two buttons, fetches the book title and chapter index on the
    GUI thread, and hands the chapter downloads to a ``Worker`` thread whose
    ``finished`` signal is rendered in the log view by ``receive``.
    """

    # Seconds to wait for the site. Without a timeout ``requests.get`` may
    # block forever, and these requests run on the GUI thread.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        super().__init__()
        self.setupUi(self)  # build the widgets declared in Ui_MainWindow
        self.setWindowIcon(QIcon("./IMG/icon/icon.jpg"))
        self.setWindowTitle("测试使用")
        self.initUI()

    def initUI(self):
        """Connect the widgets to their handlers and pre-fill the URL field."""
        self.pushButton.clicked.connect(self.openurl)
        self.pushButton_2.clicked.connect(self.dowtext)
        self.lineEdit.setText("https://www.xtyxsw.org/read/280637/")
        # Connect the anchor handler exactly once. Connecting it every time a
        # download completes stacks up duplicate connections, so one click
        # would invoke the slot once per completed download.
        self.textBrowser.setOpenLinks(False)
        self.textBrowser.setOpenExternalLinks(False)
        self.textBrowser.anchorClicked.connect(self.click_textbrowser)

    def click_textbrowser(self):
        """Open the current working directory in the system file manager."""
        self.msg = os.getcwd()
        QDesktopServices.openUrl(QUrl.fromLocalFile(self.msg))

    def openurl(self):
        """Open the URL typed in the line edit in the default web browser."""
        geturl = self.lineEdit.text()
        print(geturl)
        print("打开网址:{}".format(geturl))
        if geturl != "":
            webbrowser.open(geturl)
        else:
            self.textBrowser.append("{}:请先输入网址路径! ".format(self.gettime()))

    def gettime(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def dowtext(self):
        """Handle the download/stop button.

        While the button reads "下载" a download is started and the label
        switches to "停止"; while it reads "停止" the running worker is asked
        to stop. A failure while fetching the index is written to the log
        instead of escaping from the slot, and the label stays "下载".
        """
        geturl = self.lineEdit.text()
        label = self.pushButton_2.text()
        if label == "下载":
            if geturl == "":
                self.textBrowser.append("{}:请先输入网址路径! ".format(self.gettime()))
                return
            try:
                self.test(geturl)
            except (requests.RequestException, UnicodeDecodeError, ValueError) as err:
                self.textBrowser.append("{}:下载失败--{}".format(self.gettime(), err))
                return
            self.pushButton_2.setText("停止")
        elif label == "停止":
            self.worker.change_ret()

    def test(self, url):
        """Fetch title and chapter index for ``url`` and start the worker.

        Raises:
            requests.RequestException: the index page could not be fetched.
            ValueError: no book title was found in the page.
        """
        book_name = self.get_book_url(url)
        print("获取书名:" + book_name)
        self.textBrowser.append("{}:".format(self.gettime()) + "获取书名--" + book_name)
        htmls_list, name_list = self.get_dir(url)
        # Payload order expected by Worker: title, chapter names, chapter links.
        self.data = [book_name, name_list, htmls_list]
        self.worker = Worker(msg=self.data)
        self.worker.finished.connect(self.receive)
        self.worker.start()

    def get_url(self, url):
        """Download ``url`` and return the body decoded as UTF-8.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-2xx HTTP status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def get_book_url(self, url):
        """Return the book title scraped from the index page at ``url``.

        Raises:
            ValueError: the title XPath matched nothing (different site
                layout or an error page).
        """
        selector = html.fromstring(self.get_url(url))
        shumin = selector.xpath('/html/body/div[3]/div[2]/div/span/text()')
        if not shumin:
            raise ValueError("未能从页面解析出书名")
        return shumin[0]

    def get_dir(self, url):
        """Return ``(chapter_links, chapter_names)`` scraped from the index page."""
        selector = html.fromstring(self.get_url(url))
        htmls_list = [
            str(link)
            for link in selector.xpath('//div[@class = "link_14"]/dl/dd/a/@href')
        ]
        names_list = [
            str(name)
            for name in selector.xpath('//div[@class = "link_14"]/dl/dd/a/text()')
        ]

        print("每章节链接:" + str(htmls_list))
        print("每章节目录:" + str(names_list))
        print(len(names_list))
        return htmls_list, names_list

    def receive(self, text=None):
        """Show a worker message ``[code, message]`` in the log view.

        Codes 1 (chapter written) and 2 (stopped) are only logged; code 3
        (worker finished) is logged and resets the button label to "下载".
        An empty or missing payload is ignored.
        """
        if not text:
            return
        code = text[0]
        if code in (1, 2):
            self.textBrowser.append("{}:{} ".format(self.gettime(), text[1]))
        elif code == 3:
            self.textBrowser.append("{}:{}".format(self.gettime(), text[1]))
            self.pushButton_2.setText("下载")
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window,
    # and exit the process with the event loop's return code.
    qt_app = QApplication(sys.argv)
    main_window = LanFei_show_window()
    main_window.show()
    exit_code = qt_app.exec_()
    sys.exit(exit_code)

Ui_dowondstory.py:pyqt程序

代码如下:

# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'd:\pythonitem\爬虫小说\dowondstory.ui'
#
# Created by: PyQt5 UI code generator 5.15.11
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_MainWindow(object):
    """Widget layout of the downloader window (originally pyuic5 output)."""

    def setupUi(self, MainWindow):
        """Create all widgets, place them on a grid and attach them to MainWindow."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(579, 368)

        central = QtWidgets.QWidget(MainWindow)
        central.setObjectName("centralwidget")
        self.centralwidget = central

        grid = QtWidgets.QGridLayout(central)
        grid.setObjectName("gridLayout")
        self.gridLayout = grid

        # (attribute / object name, widget class, (row, column, rowSpan, columnSpan))
        placements = (
            ("label", QtWidgets.QLabel, (0, 0, 1, 1)),
            ("lineEdit", QtWidgets.QLineEdit, (0, 1, 1, 1)),
            ("pushButton", QtWidgets.QPushButton, (0, 2, 1, 1)),
            ("pushButton_2", QtWidgets.QPushButton, (0, 3, 1, 1)),
            ("textBrowser", QtWidgets.QTextBrowser, (1, 0, 1, 4)),
        )
        for object_name, widget_type, cell in placements:
            widget = widget_type(central)
            widget.setObjectName(object_name)
            grid.addWidget(widget, *cell)
            setattr(self, object_name, widget)

        MainWindow.setCentralWidget(central)

        menu_bar = QtWidgets.QMenuBar(MainWindow)
        menu_bar.setGeometry(QtCore.QRect(0, 0, 579, 23))
        menu_bar.setObjectName("menubar")
        self.menubar = menu_bar
        MainWindow.setMenuBar(menu_bar)

        status_bar = QtWidgets.QStatusBar(MainWindow)
        status_bar.setObjectName("statusbar")
        self.statusbar = status_bar
        MainWindow.setStatusBar(status_bar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the visible texts of the label and buttons."""
        tr = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(tr("MainWindow", "MainWindow"))
        self.label.setText(tr("MainWindow", "下载地址:"))
        self.pushButton.setText(tr("MainWindow", "打开"))
        self.pushButton_2.setText(tr("MainWindow", "下载"))

3.QT业务控制程序

pangchong.py:爬取章节小说的业务执行程序

代码如下:

import requests
import os
from lxml import html #调用lxml模块和requests模块
import time
import time
from PyQt5.QtCore import QThread,pyqtSignal
import threading
class Worker(QThread):
    """Background thread that downloads every chapter of one book.

    ``msg`` is ``[book_name, chapter_names, chapter_links]``. Progress is
    reported through ``finished`` as ``[code, message]``: code 1 after each
    chapter is written, code 2 when the download was stopped or a chapter
    failed, and code 3 once when ``run`` ends.
    """

    # NOTE(review): this attribute shadows QThread's own ``finished`` signal;
    # kept because the signal name is this class's public interface.
    finished = pyqtSignal(list)

    # Seconds to wait for the site before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Chapter links from the index are site-relative and get this prefix.
    BASE_URL = "https://www.xtyxsw.org"
    # Characters that are not allowed in Windows file names; they are removed
    # from chapter titles before the title is used as a file name.
    _RESERVED_CHARS = str.maketrans("", "", '\\/:*?"<>|')

    def __init__(self, msg=None):
        super().__init__()
        self.msg = msg
        self.ret = "True"  # set to "break" by change_ret() to request a stop

    def run(self):
        """Download the chapters in order until done or asked to stop.

        ``save`` is called directly: this method already runs on the worker
        thread, and the previous ``threading.Thread(target=self.save(...))``
        executed ``save`` eagerly and then started a thread with no target.
        A failing chapter is reported with code 2 and skipped so that one bad
        page does not terminate the thread without the final code-3 signal.
        """
        book_name, name_list, htmls_list = self.msg[0], self.msg[1], self.msg[2]
        for number in range(len(name_list)):
            if self.ret == "break":
                self.finished.emit([2, "已停止下载!"])
                break
            try:
                self.save(book_name, name_list, htmls_list, number)
            except Exception as err:  # thread boundary: report, never crash
                self.finished.emit(
                    [2, "第{}章下载失败:{}".format(number + 1, err)]
                )
        # Code 3 is emitted on every exit path, including after a stop.
        self.finished.emit([3, "完成下载!"])

    def change_ret(self):
        """Ask ``run`` to stop before it starts the next chapter."""
        self.ret = "break"

    def get_url(self, url):
        """Download ``url`` and return the body decoded as UTF-8.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-2xx HTTP status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def get_neirong(self, htmls_list, number):
        """Return the paragraphs of chapter ``number`` as a list of strings.

        When the page contains a "下一页" link, the paragraphs of that one
        follow-up page are appended as well.
        """
        url = self.BASE_URL + htmls_list[number]
        print("网址:" + url)
        selector = html.fromstring(self.get_url(url))
        liebiao = [
            str(text) for text in selector.xpath('//div[@id="content"]/p/text()')
        ]
        txts = selector.xpath('//a/text()')
        if "下一页" in txts:
            dizhi = selector.xpath('//a/@href')
            print(dizhi)
            # NOTE(review): the next-page link is taken as the 4th anchor from
            # the end of the page, which depends on this site's exact layout;
            # only a single follow-up page is fetched.
            url = self.BASE_URL + dizhi[-4]
            next_page = html.fromstring(self.get_url(url))
            liebiao.extend(
                str(text)
                for text in next_page.xpath('//div[@id="content"]/p/text()')
            )
        return liebiao

    def save(self, book_name, name_list, htmls_list, number):
        """Download chapter ``number`` and write it to ``<cwd>/<book>/<title>.txt``.

        The book directory is created when missing. A negative ``number``
        only ensures the directory exists. Emits ``[1, message]`` after the
        file is written and then pauses for half a second.
        """
        path = os.path.join(os.getcwd(), str(book_name))
        os.makedirs(path, exist_ok=True)
        if number < 0:
            return
        liebiao = self.get_neirong(htmls_list, number)
        mulu = str(name_list[int(number)]).translate(self._RESERVED_CHARS)
        paths = os.path.join(path, mulu + ".txt")
        with open(paths, "w", encoding="utf-8") as file:
            file.writelines(wenzhi + "\n" for wenzhi in liebiao)
        h = "完成第" + str(int(number) + 1) + "章写入!"
        print(h)
        self.finished.emit([1, h])
        time.sleep(0.5)  # pause between chapter requests

    def finisheds(self, i, h=None):
        """Emit ``[i, h]`` on the ``finished`` signal."""
        self.finished.emit([i, h])

4.批量修改文件名称

xiugainame.py:将文件名中的汉字数字转换为阿拉伯数字

修改前和修改后的显示图片

xiugainame.py:修改文件名称程序

代码如下:

import os
import re

# Batch-rename downloaded chapter files so that the Chinese chapter number in
# "第…章" becomes an Arabic number (e.g. "第一百二十章" -> "第120章").
path = "./末日重生:开局囤积SSS级卡牌小说"

# Value of each Chinese digit character.
CHINESE_DIGITS = {
    "零": 0, "〇": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4,
    "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
}
# Multiplier of each Chinese unit character.
CHINESE_UNITS = {"十": 10, "百": 100, "千": 1000}
# A chapter marker: "第", a run of numeral characters, then "章".
CHAPTER_PATTERN = re.compile(
    "第([" + "".join(CHINESE_DIGITS) + "".join(CHINESE_UNITS) + "]+)章"
)


def chinese_to_int(numeral):
    """Return the integer value of a Chinese numeral below ten thousand.

    Handles unit notation ("一百零五" -> 105, "十一" -> 11) and purely
    positional digit strings ("一二三" -> 123).

    Raises:
        KeyError: ``numeral`` contains a character that is neither a digit
            nor one of the units 十/百/千.
    """
    if len(numeral) > 1 and all(ch in CHINESE_DIGITS for ch in numeral):
        return int("".join(str(CHINESE_DIGITS[ch]) for ch in numeral))
    total = 0
    pending = None  # digit read but not yet multiplied by a unit
    for ch in numeral:
        if ch in CHINESE_DIGITS:
            pending = CHINESE_DIGITS[ch]
        else:
            # A unit without a preceding digit counts once ("十" == 10).
            total += (1 if pending is None else pending) * CHINESE_UNITS[ch]
            pending = None
    return total + (pending or 0)


def convert_name(filename):
    """Return ``filename`` with every "第<Chinese number>章" written in digits.

    Only the chapter marker is rewritten; numerals elsewhere in the title
    and names without a chapter marker are returned unchanged.
    """
    return CHAPTER_PATTERN.sub(
        lambda match: "第{}章".format(chinese_to_int(match.group(1))),
        filename,
    )


def rename_chapters(directory):
    """Rename every file in ``directory`` according to ``convert_name``.

    Files whose name does not change are skipped, and an existing target is
    never overwritten.
    """
    for old_name in os.listdir(directory):
        new_name = convert_name(old_name)
        if new_name == old_name:
            continue
        old_path = os.path.join(directory, old_name)
        new_path = os.path.join(directory, new_name)
        if os.path.exists(new_path):
            print("目标文件已存在,跳过:{}".format(new_name))
            continue
        print("{} -> {}".format(old_name, new_name))
        try:
            os.rename(old_path, new_path)
        except FileNotFoundError:
            print("源文件未找到")
        except PermissionError:
            print("权限不足,无法修改文件名")


if __name__ == "__main__":
    rename_chapters(path)

总结

娱乐使用,仅供参考;不同的网站可能格式不同,大家自行钻研,嘿嘿。

作者:山中坐原文地址:https://blog.csdn.net/Offivensive888/article/details/142956791

%s 个评论

要回复文章请先登录注册