标题: 关于用socket实现一个超简单的爬虫 [打印本页]

作者: 19981998    时间: 2018-11-29 20:42
标题: 关于用socket实现一个超简单的爬虫
#仅供参考
import os
import re
import socket
import ssl
from urllib.parse import urljoin, urlsplit


# Page to crawl; every value below is derived from this one URL.
url = "http://csse.xjtlu.edu.cn/classes/CSE205/"

# Split "scheme://host/path" with partition() so that a URL without a scheme
# or without a path still yields usable values, instead of raising IndexError
# or silently chopping the last character off the host (find() returning -1).
protocol, _sep, u = url.partition('://')
if not _sep:
    # No scheme given: treat the whole string as host/path over plain HTTP.
    protocol, u = 'http', url

host, _, _rest = u.partition('/')
# The request target must always start with '/', even for "http://host".
path = '/' + _rest
# Directory the downloaded images are written to (drive D:, one folder per host).
file_path = 'D:\\{}'.format(host)

# Number of bytes requested from the socket per recv() call.
_BUFFER_SIZE = 1024

# Captures the src attribute of each <img> tag.  [^>] keeps the match inside
# a single tag (and, unlike '.', spans newlines); requiring whitespace before
# "src" avoids matching look-alike attributes such as data-src.
_IMG_SRC_RE = re.compile(r"""<img\b[^>]*?\ssrc\s*=\s*["']?([^\s"'>]+)""", re.I)

# Characters that are not allowed in a Windows file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _http_get(address):
    """Fetch *address* with a raw-socket GET and return the whole response.

    Args:
        address: Absolute ``http://`` or ``https://`` URL.

    Returns:
        The raw response bytes: status line, headers and body.

    Raises:
        OSError: On connection, TLS or transfer errors.
    """
    parts = urlsplit(address)
    secure = parts.scheme.lower() == 'https'
    port = parts.port or (443 if secure else 80)
    target = parts.path or '/'
    if parts.query:
        target += '?' + parts.query

    # "Connection: close" makes the server close the socket after the
    # response, so end-of-stream is a reliable end-of-response marker.
    # NOTE: a chunked Transfer-Encoding body is returned as-is, not decoded.
    request = 'GET {} HTTP/1.1\r\nhost:{}\r\nConnection: close\r\n\r\n'.format(
        target, parts.netloc)
    print(request)

    sock = socket.create_connection((parts.hostname, port))
    if secure:
        # ssl.wrap_socket() was removed in Python 3.12; a default context
        # also verifies the certificate and the host name.
        try:
            sock = ssl.create_default_context().wrap_socket(
                sock, server_hostname=parts.hostname)
        except OSError:
            sock.close()
            raise

    chunks = []
    with sock:
        sock.sendall(request.encode())
        while True:
            # A short read does not mean the response is finished; only an
            # empty read (peer closed the connection) does.
            chunk = sock.recv(_BUFFER_SIZE)
            if not chunk:
                break
            chunks.append(chunk)
    return b''.join(chunks)


def _split_response(raw):
    """Split a raw HTTP response into ``(status_code, body)``.

    The body is everything after the first blank line, so binary payloads
    that themselves contain CRLF CRLF are kept intact.  ``status_code`` is
    the second field of the status line as text, or ``''`` if it is missing.
    """
    head, _, body = raw.partition(b'\r\n\r\n')
    fields = head.split(b'\r\n', 1)[0].split(None, 2)
    status = fields[1].decode('ascii', 'replace') if len(fields) > 1 else ''
    return status, body


def get_html(url):
    """Download *url* and return the complete response as text.

    Args:
        url: Page address; ``http://`` is assumed when no scheme is given.

    Returns:
        The response (status line, headers and body) decoded as UTF-8, with
        undecodable bytes replaced rather than raising.  The text is also
        printed.

    Raises:
        OSError: On connection, TLS or transfer errors.
    """
    if '://' not in url:
        url = 'http://' + url

    response = _http_get(url).decode(errors='replace')
    print(response)
    return response


def get_img(response):
    """Download every image referenced by ``<img src=...>`` in *response*.

    Image addresses are resolved against the module-level ``protocol``,
    ``host`` and ``path``; files are written into the module-level
    ``file_path`` directory, named after the src value with characters that
    are illegal in file names replaced by '.'.

    Args:
        response: HTML text (response headers in front of it are harmless).

    Raises:
        OSError: On connection, TLS, transfer or file-system errors.
    """
    sources = _IMG_SRC_RE.findall(response)
    if not sources:
        return

    page_url = '{}://{}{}'.format(protocol, host, path)
    os.makedirs(file_path, exist_ok=True)

    for src in sources:
        # urljoin handles relative, root-relative ("/a.png"),
        # protocol-relative ("//cdn/a.png") and absolute src values alike.
        image_url = urljoin(page_url, src)
        if not image_url.lower().startswith(('http://', 'https://')):
            # e.g. inline "data:" images - nothing to download.
            print('skipping {}: not an http(s) address'.format(src))
            continue

        status, data = _split_response(_http_get(image_url))
        if status != '200':
            # Do not save an error page or redirect notice as an image.
            print('skipping {}: HTTP status {}'.format(src, status or 'unknown'))
            continue

        name = _UNSAFE_FILENAME_CHARS.sub('.', src)
        with open(os.path.join(file_path, name), 'wb') as f:
            f.write(data)


def mkdir(file_path):
    """Create the directory *file_path*, including missing parents.

    Args:
        file_path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if the path
        already existed.

    Raises:
        OSError: If the directory cannot be created for any other reason.
    """
    # Attempt the creation directly instead of checking first: a separate
    # exists() test can be invalidated by another process before makedirs().
    try:
        os.makedirs(file_path)
    except FileExistsError:
        return False
    return True


# Script driver (runs on import as well as on direct execution):
# 1. make sure the output directory for this host exists,
# 2. download the page at `url`,
# 3. download every image the page references.
mkdir(file_path)
response = get_html(url)  # response text returned by get_html
get_img(response)






欢迎光临 (http://www.51hei.com/bbs/) Powered by Discuz! X3.1